DataCamp offers interactive courses related to R programming. While some of the material is review, it is helpful to see other perspectives on it. As well, DataCamp has some interesting materials on packages that I want to learn better (ggplot2, dplyr, ggvis, etc.). This document summarizes a few key insights from:
The original DataCamp_Insights_v001 and DataCamp_Insights_v002 documents have been split, with this document containing the portions on Statistics:
Chapter 1 - Language of Data
Examining the “High School and Beyond” data frame - one observation per row, one variable per column:
Types of variables - take note of the dimensions first:
Categorical data in R - factors:
Discretize a variable - convert numerical variable to categorical variable:
Visualizing numerical data - good first step of any exploratory data analysis (picture is worth 1000 words):
Example code includes:
# --- Setup: load packages. The "##" lines throughout are the recorded
# --- console output from the original run, kept as a transcript.
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.2.5
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.5
# Load data
# email50: a 50-email sample from the openintro package, one row per email,
# one variable per column (21 variables).
data(email50, package="openintro")
# View its structure
str(email50)
## 'data.frame': 50 obs. of 21 variables:
## $ spam : num 0 0 1 0 0 0 0 0 0 0 ...
## $ to_multiple : num 0 0 0 0 0 0 0 0 0 0 ...
## $ from : num 1 1 1 1 1 1 1 1 1 1 ...
## $ cc : int 0 0 4 0 0 0 0 0 1 0 ...
## $ sent_email : num 1 0 0 0 0 0 0 1 1 0 ...
## $ time : POSIXct, format: "2012-01-04 07:19:16" "2012-02-16 14:10:06" ...
## $ image : num 0 0 0 0 0 0 0 0 0 0 ...
## $ attach : num 0 0 2 0 0 0 0 0 0 0 ...
## $ dollar : num 0 0 0 0 9 0 0 0 0 23 ...
## $ winner : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ inherit : num 0 0 0 0 0 0 0 0 0 0 ...
## $ viagra : num 0 0 0 0 0 0 0 0 0 0 ...
## $ password : num 0 0 0 0 1 0 0 0 0 0 ...
## $ num_char : num 21.705 7.011 0.631 2.454 41.623 ...
## $ line_breaks : int 551 183 28 61 1088 5 17 88 242 578 ...
## $ format : num 1 1 0 0 1 0 0 1 1 1 ...
## $ re_subj : num 1 0 0 0 0 0 0 1 1 0 ...
## $ exclaim_subj: num 0 0 0 0 0 0 0 0 1 0 ...
## $ urgent_subj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ exclaim_mess: num 8 1 2 1 43 0 0 2 22 3 ...
## $ number : Factor w/ 3 levels "none","small",..: 2 3 1 2 2 2 2 2 2 2 ...
# Glimpse email50
# glimpse() is dplyr's analogue of str(): same information, transposed layout.
glimpse(email50)
## Observations: 50
## Variables: 21
## $ spam <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0...
## $ to_multiple <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0...
## $ from <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ cc <int> 0, 0, 4, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0...
## $ sent_email <dbl> 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1...
## $ time <dttm> 2012-01-04 07:19:16, 2012-02-16 14:10:06, 2012-0...
## $ image <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ attach <dbl> 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0...
## $ dollar <dbl> 0, 0, 0, 0, 9, 0, 0, 0, 0, 23, 4, 0, 3, 2, 0, 0, ...
## $ winner <fctr> no, no, no, no, no, no, no, no, no, no, no, no, ...
## $ inherit <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ viagra <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ password <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0...
## $ num_char <dbl> 21.705, 7.011, 0.631, 2.454, 41.623, 0.057, 0.809...
## $ line_breaks <int> 551, 183, 28, 61, 1088, 5, 17, 88, 242, 578, 1167...
## $ format <dbl> 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1...
## $ re_subj <dbl> 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1...
## $ exclaim_subj <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0...
## $ urgent_subj <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ exclaim_mess <dbl> 8, 1, 2, 1, 43, 0, 0, 2, 22, 3, 13, 1, 2, 2, 21, ...
## $ number <fctr> small, big, none, small, small, small, small, sm...
# Subset of emails with big numbers: email50_big
# Keep only the rows whose `number` factor equals "big" (7 of 50 emails).
email50_big <- email50 %>%
filter(number == "big")
# Glimpse the subset
glimpse(email50_big)
## Observations: 7
## Variables: 21
## $ spam <dbl> 0, 0, 1, 0, 0, 0, 0
## $ to_multiple <dbl> 0, 0, 0, 0, 0, 0, 0
## $ from <dbl> 1, 1, 1, 1, 1, 1, 1
## $ cc <int> 0, 0, 0, 0, 0, 0, 0
## $ sent_email <dbl> 0, 0, 0, 0, 0, 1, 0
## $ time <dttm> 2012-02-16 14:10:06, 2012-02-04 17:26:09, 2012-0...
## $ image <dbl> 0, 0, 0, 0, 0, 0, 0
## $ attach <dbl> 0, 0, 0, 0, 0, 0, 0
## $ dollar <dbl> 0, 0, 3, 2, 0, 0, 0
## $ winner <fctr> no, no, yes, no, no, no, no
## $ inherit <dbl> 0, 0, 0, 0, 0, 0, 0
## $ viagra <dbl> 0, 0, 0, 0, 0, 0, 0
## $ password <dbl> 0, 2, 0, 0, 0, 0, 8
## $ num_char <dbl> 7.011, 10.368, 42.793, 26.520, 6.563, 11.223, 10.613
## $ line_breaks <int> 183, 198, 712, 692, 140, 512, 225
## $ format <dbl> 1, 1, 1, 1, 1, 1, 1
## $ re_subj <dbl> 0, 0, 0, 0, 0, 0, 0
## $ exclaim_subj <dbl> 0, 0, 0, 1, 0, 0, 0
## $ urgent_subj <dbl> 0, 0, 0, 0, 0, 0, 0
## $ exclaim_mess <dbl> 1, 1, 2, 7, 2, 9, 9
## $ number <fctr> big, big, big, big, big, big, big
# Table of number variable
# Note: filtering does NOT drop unused factor levels -- "none" and "small"
# still appear in the table with zero counts.
table(email50_big$number)
##
## none small big
## 0 0 7
# Drop levels
# droplevels() removes the now-empty "none" and "small" levels.
email50_big$number <- droplevels(email50_big$number)
# Another table of number variable
table(email50_big$number)
##
## big
## 7
# Calculate median number of characters: med_num_char
# Note that wrapping in () also prints the variable
(med_num_char <- median(email50$num_char))
## [1] 6.8895
# Create num_char_cat variable in email50
# Discretize a numerical variable: split at the median, which by
# construction yields two equal-sized groups (25 / 25 below).
email50 <- email50 %>%
mutate(num_char_cat = ifelse(num_char < med_num_char, "below median", "at or above median"))
# Count emails in each category
table(email50$num_char_cat)
##
## at or above median below median
## 25 25
# Flag whether each email contains any number at all ("yes"/"no").
email50 <- mutate(
  email50,
  number_yn = ifelse(number != "none", "yes", "no")
)
# Bar chart of the yes/no flag.
bar_number_yn <- ggplot(data = email50, mapping = aes(x = number_yn)) +
  geom_bar()
bar_number_yn
# Exclamation marks in the message vs. message length, colored by spam status.
scatter_exclaim <- ggplot(data = email50,
                          mapping = aes(x = num_char, y = exclaim_mess,
                                        color = factor(spam))) +
  geom_point()
scatter_exclaim
Chapter 2 - Study Types and Cautions
Observational studies and experiments - study types, and scopes of inferences:
Random sampling and random assignment:
Simpson’s paradox - when a confounder interferes with understanding response (y) variables and explanatory (x1, x2, etc.) variables:
Example code includes:
# Load data
# gapminder: country-level life expectancy / population / GDP panel data.
data(gapminder, package="gapminder")
# Glimpse data
glimpse(gapminder)
## Observations: 1,704
## Variables: 6
## $ country <fctr> Afghanistan, Afghanistan, Afghanistan, Afghanistan,...
## $ continent <fctr> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asi...
## $ year <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992...
## $ lifeExp <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.8...
## $ pop <int> 8425333, 9240934, 10267083, 11537966, 13079460, 1488...
## $ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 78...
# Identify type of study
# (Course exercise answer: gapminder is observational, not an experiment.)
type_of_study <- "observational"
# Expand the aggregated UCBAdmissions contingency table (counts in Freq)
# into one row per applicant, so dplyr::count() can re-tabulate it.
dfUCB <- as.data.frame(UCBAdmissions)
ucb_admit <- data.frame(Admit=factor(rep(dfUCB$Admit, times=dfUCB$Freq)),
Gender=factor(rep(dfUCB$Gender, times=dfUCB$Freq)),
Dept=as.character(rep(dfUCB$Dept, times=dfUCB$Freq)),
stringsAsFactors=FALSE
)
str(ucb_admit)
## 'data.frame': 4526 obs. of 3 variables:
## $ Admit : Factor w/ 2 levels "Admitted","Rejected": 1 1 1 1 1 1 1 1 1 1 ...
## $ Gender: Factor w/ 2 levels "Male","Female": 1 1 1 1 1 1 1 1 1 1 ...
## $ Dept : chr "A" "A" "A" "A" ...
# Count number of male and female applicants admitted
ucb_counts <- ucb_admit %>%
count(Admit, Gender)
# View result
ucb_counts
## Source: local data frame [4 x 3]
## Groups: Admit [?]
##
## Admit Gender n
## <fctr> <fctr> <int>
## 1 Admitted Male 1198
## 2 Admitted Female 557
## 3 Rejected Male 1493
## 4 Rejected Female 1278
# Spread the output across columns
# Reshape long counts to wide: one row per Gender, one column per Admit level.
ucb_counts %>%
tidyr::spread(Admit, n)
## # A tibble: 2 × 3
## Gender Admitted Rejected
## * <fctr> <int> <int>
## 1 Male 1198 1493
## 2 Female 557 1278
# Aggregate admission rate by gender: appears to favor males
# (44.5% vs 30.4% admitted).
ucb_admit %>%
# Table of counts of admission status and gender
count(Admit, Gender) %>%
# Spread output across columns based on admission status
tidyr::spread(Admit, n) %>%
# Create new variable
mutate(Perc_Admit = Admitted / (Admitted + Rejected))
## # A tibble: 2 × 4
## Gender Admitted Rejected Perc_Admit
## <fctr> <int> <int> <dbl>
## 1 Male 1198 1493 0.4451877
## 2 Female 557 1278 0.3035422
# Table of counts of admission status and gender for each department
# Conditioning on department reveals Simpson's paradox: within departments
# the female admission rate is comparable to or higher than the male rate
# (see Perc_Admit below for A, B, D, F), despite the lower aggregate rate.
admit_by_dept <- ucb_admit %>%
count(Dept, Gender, Admit) %>%
tidyr::spread(Admit, n)
# View result
admit_by_dept
## Source: local data frame [12 x 4]
## Groups: Dept, Gender [12]
##
## Dept Gender Admitted Rejected
## * <chr> <fctr> <int> <int>
## 1 A Male 512 313
## 2 A Female 89 19
## 3 B Male 353 207
## 4 B Female 17 8
## 5 C Male 120 205
## 6 C Female 202 391
## 7 D Male 138 279
## 8 D Female 131 244
## 9 E Male 53 138
## 10 E Female 94 299
## 11 F Male 22 351
## 12 F Female 24 317
# Percentage of males admitted for each department
admit_by_dept %>%
mutate(Perc_Admit = Admitted / (Admitted + Rejected))
## Source: local data frame [12 x 5]
## Groups: Dept, Gender [12]
##
## Dept Gender Admitted Rejected Perc_Admit
## <chr> <fctr> <int> <int> <dbl>
## 1 A Male 512 313 0.62060606
## 2 A Female 89 19 0.82407407
## 3 B Male 353 207 0.63035714
## 4 B Female 17 8 0.68000000
## 5 C Male 120 205 0.36923077
## 6 C Female 202 391 0.34064081
## 7 D Male 138 279 0.33093525
## 8 D Female 131 244 0.34933333
## 9 E Male 53 138 0.27748691
## 10 E Female 94 299 0.23918575
## 11 F Male 22 351 0.05898123
## 12 F Female 24 317 0.07038123
Chapter 3 - Sampling Strategies and Experimental Design
Sampling strategies:
Sampling in R:
Principles of experimental design:
Example code includes:
# Build a state -> Census-region lookup table from two parallel
# semicolon-delimited strings (51 entries incl. District of Columbia;
# the two strings must stay aligned element-for-element).
usrState <- "Connecticut ; Maine ; Massachusetts ; New Hampshire ; Rhode Island ; Vermont ; New Jersey ; New York ; Pennsylvania ; Illinois ; Indiana ; Michigan ; Ohio ; Wisconsin ; Iowa ; Kansas ; Minnesota ; Missouri ; Nebraska ; North Dakota ; South Dakota ; Delaware ; Florida ; Georgia ; Maryland ; North Carolina ; South Carolina ; Virginia ; District of Columbia ; West Virginia ; Alabama ; Kentucky ; Mississippi ; Tennessee ; Arkansas ; Louisiana ; Oklahoma ; Texas ; Arizona ; Colorado ; Idaho ; Montana ; Nevada ; New Mexico ; Utah ; Wyoming ; Alaska ; California ; Hawaii ; Oregon ; Washington"
usrRegion <- "Northeast ; Northeast ; Northeast ; Northeast ; Northeast ; Northeast ; Northeast ; Northeast ; Northeast ; Midwest ; Midwest ; Midwest ; Midwest ; Midwest ; Midwest ; Midwest ; Midwest ; Midwest ; Midwest ; Midwest ; Midwest ; South ; South ; South ; South ; South ; South ; South ; South ; South ; South ; South ; South ; South ; South ; South ; South ; South ; West ; West ; West ; West ; West ; West ; West ; West ; West ; West ; West ; West ; West"
us_regions <- data.frame(state=factor(strsplit(usrState, " ; ")[[1]]),
region=factor(strsplit(usrRegion, " ; ")[[1]])
)
# Simple random sample: states_srs
# NOTE(review): no set.seed() call anywhere in this section, so the sampled
# rows -- and the "##" counts recorded below -- are not reproducible.
states_srs <- us_regions %>%
dplyr::sample_n(size=8)
# Count states by region
# A simple random sample gives no guarantee of balance across regions.
states_srs %>%
group_by(region) %>%
count()
## # A tibble: 4 × 2
## region n
## <fctr> <int>
## 1 Midwest 1
## 2 Northeast 2
## 3 South 3
## 4 West 2
# Stratified sample
# Grouping before sample_n() draws size=2 WITHIN each region (stratum),
# guaranteeing exactly 2 states per region.
states_str <- us_regions %>%
group_by(region) %>%
dplyr::sample_n(size=2)
# Count states by region
states_str %>%
group_by(region) %>%
count()
## # A tibble: 4 × 2
## region n
## <fctr> <int>
## 1 Midwest 2
## 2 Northeast 2
## 3 South 2
## 4 West 2
Chapter 4 - Case Study
Data will be from a study titled “Beauty in the Classroom”:
Variables in the data:
Example code includes:
# NEED DATASET
evStudents <- "43 ; 125 ; 125 ; 123 ; 20 ; 40 ; 44 ; 55 ; 195 ; 46 ; 27 ; 25 ; 20 ; 25 ; 42 ; 20 ; 18 ; 48 ; 44 ; 48 ; 45 ; 59 ; 87 ; 282 ; 292 ; 130 ; 285 ; 272 ; 286 ; 302 ; 41 ; 34 ; 41 ; 41 ; 34 ; 41 ; 22 ; 21 ; 17 ; 30 ; 23 ; 20 ; 60 ; 33 ; 44 ; 49 ; 29 ; 48 ; 40 ; 19 ; 16 ; 15 ; 23 ; 11 ; 29 ; 21 ; 18 ; 19 ; 20 ; 25 ; 33 ; 24 ; 34 ; 21 ; 30 ; 25 ; 35 ; 40 ; 30 ; 42 ; 57 ; 57 ; 51 ; 30 ; 36 ; 37 ; 29 ; 27 ; 28 ; 52 ; 26 ; 30 ; 33 ; 177 ; 199 ; 32 ; 37 ; 161 ; 41 ; 44 ; 53 ; 49 ; 32 ; 135 ; 33 ; 19 ; 111 ; 149 ; 27 ; 136 ; 140 ; 31 ; 15 ; 29 ; 25 ; 18 ; 45 ; 15 ; 38 ; 15 ; 28 ; 23 ; 19 ; 23 ; 22 ; 20 ; 19 ; 23 ; 22 ; 15 ; 22 ; 31 ; 21 ; 36 ; 19 ; 37 ; 26 ; 39 ; 184 ; 50 ; 157 ; 164 ; 24 ; 68 ; 47 ; 14 ; 15 ; 24 ; 39 ; 26 ; 40 ; 159 ; 151 ; 47 ; 122 ; 45 ; 16 ; 23 ; 16 ; 18 ; 16 ; 15 ; 28 ; 17 ; 13 ; 21 ; 17 ; 134 ; 48 ; 64 ; 69 ; 12 ; 43 ; 14 ; 15 ; 18 ; 16 ; 10 ; 47 ; 15 ; 14 ; 12 ; 246 ; 316 ; 15 ; 15 ; 29 ; 21 ; 8 ; 16 ; 26 ; 10 ; 26 ; 26 ; 26 ; 21 ; 12 ; 27 ; 27 ; 25 ; 15 ; 15 ; 17 ; 55 ; 48 ; 21 ; 39 ; 27 ; 14 ; 26 ; 16 ; 16 ; 13 ; 14 ; 17 ; 13 ; 15 ; 10 ; 34 ; 16 ; 14 ; 12 ; 39 ; 35 ; 45 ; 45 ; 17 ; 14 ; 14 ; 14 ; 12 ; 15 ; 51 ; 23 ; 57 ; 50 ; 24 ; 23 ; 23 ; 28 ; 45 ; 42 ; 57 ; 27 ; 38 ; 22 ; 43 ; 31 ; 13 ; 15 ; 34 ; 19 ; 20 ; 23 ; 27 ; 32 ; 21 ; 24 ; 21 ; 28 ; 29 ; 67 ; 89 ; 82 ; 122 ; 131 ; 114 ; 149 ; 23 ; 98 ; 27 ; 30 ; 30 ; 69 ; 15 ; 10 ; 11 ; 14 ; 11 ; 14 ; 77 ; 41 ; 88 ; 78 ; 65 ; 157 ; 68 ; 67 ; 80 ; 137 ; 69 ; 91 ; 80 ; 90 ; 34 ; 73 ; 44 ; 36 ; 20 ; 35 ; 248 ; 168 ; 247 ; 22 ; 103 ; 62 ; 82 ; 51 ; 35 ; 34 ; 37 ; 14 ; 266 ; 254 ; 13 ; 282 ; 17 ; 19 ; 42 ; 27 ; 16 ; 19 ; 86 ; 29 ; 88 ; 98 ; 44 ; 65 ; 63 ; 75 ; 43 ; 80 ; 52 ; 48 ; 66 ; 100 ; 11 ; 16 ; 22 ; 11 ; 10 ; 16 ; 16 ; 10 ; 32 ; 10 ; 16 ; 67 ; 22 ; 28 ; 30 ; 15 ; 13 ; 18 ; 26 ; 30 ; 14 ; 24 ; 22 ; 25 ; 26 ; 22 ; 26 ; 20 ; 22 ; 21 ; 21 ; 69 ; 65 ; 62 ; 67 ; 40 ; 45 ; 574 ; 579 ; 537 ; 581 ; 527 ; 87 ; 84 ; 79 ; 92 ; 24 ; 67 ; 103 ; 190 ; 68 ; 60 ; 64 ; 31 ; 62 ; 37 ; 13 ; 13 ; 15 ; 79 ; 13 ; 
98 ; 97 ; 11 ; 78 ; 56 ; 20 ; 17 ; 20 ; 19 ; 26 ; 14 ; 18 ; 12 ; 19 ; 16 ; 16 ; 12 ; 17 ; 15 ; 16 ; 17 ; 21 ; 17 ; 10 ; 17 ; 17 ; 18 ; 16 ; 26 ; 18 ; 20 ; 17 ; 21 ; 21 ; 20 ; 20 ; 13 ; 16 ; 17 ; 18 ; 24 ; 20 ; 120 ; 155 ; 38 ; 70 ; 149 ; 137 ; 29 ; 55 ; 136 ; 96 ; 60 ; 108 ; 39 ; 15 ; 111 ; 17 ; 19 ; 27 ; 19 ; 13 ; 19 ; 22 ; 20 ; 27 ; 132 ; 127 ; 85 ; 101 ; 21 ; 86 ; 84 ; 67 ; 66 ; 35"
evScore <- "4.7 ; 4.1 ; 3.9 ; 4.8 ; 4.6 ; 4.3 ; 2.8 ; 4.1 ; 3.4 ; 4.5 ; 3.8 ; 4.5 ; 4.6 ; 3.9 ; 3.9 ; 4.3 ; 4.5 ; 4.8 ; 4.6 ; 4.6 ; 4.9 ; 4.6 ; 4.5 ; 4.4 ; 4.6 ; 4.7 ; 4.5 ; 4.8 ; 4.9 ; 4.5 ; 4.4 ; 4.3 ; 4.1 ; 4.2 ; 3.5 ; 3.4 ; 4.5 ; 4.4 ; 4.4 ; 2.5 ; 4.3 ; 4.5 ; 4.8 ; 4.8 ; 4.4 ; 4.7 ; 4.4 ; 4.7 ; 4.5 ; 4 ; 4.3 ; 4.4 ; 4.5 ; 5 ; 4.9 ; 4.6 ; 5 ; 4.7 ; 5 ; 3.6 ; 3.7 ; 4.3 ; 4.1 ; 4.2 ; 4.7 ; 4.7 ; 3.5 ; 4.1 ; 4.2 ; 4 ; 4 ; 3.9 ; 4.4 ; 3.8 ; 3.5 ; 4.2 ; 3.5 ; 3.6 ; 2.9 ; 3.3 ; 3.3 ; 3.2 ; 4.6 ; 4.2 ; 4.3 ; 4.4 ; 4.1 ; 4.6 ; 4.4 ; 4.8 ; 4.3 ; 3.6 ; 4.3 ; 4 ; 4.2 ; 4.1 ; 4.1 ; 4.4 ; 4.3 ; 4.4 ; 4.4 ; 4.9 ; 5 ; 4.4 ; 4.8 ; 4.9 ; 4.3 ; 5 ; 4.7 ; 4.5 ; 3.5 ; 3.9 ; 4 ; 4 ; 3.7 ; 3.4 ; 3.3 ; 3.8 ; 3.9 ; 3.4 ; 3.7 ; 4.1 ; 3.7 ; 3.5 ; 3.5 ; 4.4 ; 3.4 ; 4.3 ; 3.7 ; 4.7 ; 3.9 ; 3.6 ; 4.5 ; 4.5 ; 4.8 ; 4.8 ; 4.7 ; 4.5 ; 4.3 ; 4.8 ; 4.1 ; 4.4 ; 4.3 ; 3.6 ; 4.5 ; 4.3 ; 4.4 ; 4.7 ; 4.8 ; 3.5 ; 3.8 ; 3.6 ; 4.2 ; 3.6 ; 4.4 ; 3.7 ; 4.3 ; 4.6 ; 4.6 ; 4.1 ; 3.6 ; 2.3 ; 4.3 ; 4.4 ; 3.6 ; 4.4 ; 3.9 ; 3.8 ; 3.4 ; 4.9 ; 4.1 ; 3.2 ; 4.2 ; 3.9 ; 4.9 ; 4.7 ; 4.4 ; 4.2 ; 4 ; 4.4 ; 3.9 ; 4.4 ; 3 ; 3.5 ; 2.8 ; 4.6 ; 4.3 ; 3.4 ; 3 ; 4.2 ; 4.3 ; 4.1 ; 4.6 ; 3.9 ; 3.5 ; 4 ; 4 ; 3.9 ; 3.3 ; 4 ; 3.8 ; 4.2 ; 4 ; 3.8 ; 3.3 ; 4.1 ; 4.7 ; 4.4 ; 4.8 ; 4.8 ; 4.6 ; 4.6 ; 4.8 ; 4.4 ; 4.7 ; 4.7 ; 3.3 ; 4.4 ; 4.3 ; 4.9 ; 4.4 ; 4.7 ; 4.3 ; 4.8 ; 4.5 ; 4.7 ; 3.3 ; 4.7 ; 4.6 ; 3.6 ; 4 ; 4.1 ; 4 ; 4.5 ; 4.6 ; 4.8 ; 4.6 ; 4.9 ; 3.1 ; 3.7 ; 3.7 ; 3.9 ; 3.9 ; 3.2 ; 4.4 ; 4.2 ; 4.7 ; 3.9 ; 3.6 ; 3.4 ; 4.4 ; 4.4 ; 4.1 ; 3.6 ; 3.5 ; 4.1 ; 3.8 ; 4 ; 4.8 ; 4.2 ; 4.6 ; 4.3 ; 4.8 ; 3.8 ; 4.5 ; 4.9 ; 4.9 ; 4.8 ; 4.7 ; 4.6 ; 4.3 ; 4.4 ; 4.5 ; 4.2 ; 4.8 ; 4.6 ; 4.9 ; 4.8 ; 4.8 ; 4.6 ; 4.7 ; 4.1 ; 3.8 ; 4 ; 4.1 ; 4 ; 4.1 ; 3.5 ; 4.1 ; 3.6 ; 4 ; 3.9 ; 3.8 ; 4.4 ; 4.7 ; 3.8 ; 4.1 ; 4.1 ; 4.7 ; 4.3 ; 4.4 ; 4.5 ; 3.1 ; 3.7 ; 4.5 ; 3 ; 4.6 ; 3.7 ; 3.6 ; 3.2 ; 3.3 ; 2.9 ; 4.2 ; 4.5 ; 3.8 ; 3.7 ; 3.7 ; 4 ; 3.7 ; 4.5 ; 3.8 ; 3.9 ; 4.6 ; 4.5 ; 4.2 ; 4 ; 3.8 ; 3.5 ; 2.7 ; 4 ; 4.6 ; 3.9 ; 4.5 ; 3.7 ; 2.4 ; 3.1 ; 2.5 ; 3 ; 4.5 ; 4.8 ; 4.9 ; 
4.5 ; 4.6 ; 4.5 ; 4.9 ; 4.4 ; 4.6 ; 4.6 ; 5 ; 4.9 ; 4.6 ; 4.8 ; 4.9 ; 4.9 ; 4.9 ; 5 ; 4.5 ; 3.5 ; 3.8 ; 3.9 ; 3.9 ; 4.2 ; 4.1 ; 4.8 ; 4.8 ; 4.8 ; 4.8 ; 4.9 ; 4.2 ; 4.5 ; 3.9 ; 4.4 ; 4 ; 3.6 ; 3.7 ; 2.7 ; 4.5 ; 4.4 ; 3.9 ; 3.6 ; 4.4 ; 4.4 ; 4.7 ; 4.5 ; 4.1 ; 3.7 ; 4.3 ; 3.5 ; 3.7 ; 4 ; 4 ; 3.1 ; 4.5 ; 4.8 ; 4.2 ; 4.9 ; 4.8 ; 3.5 ; 3.6 ; 4.4 ; 3.4 ; 3.9 ; 3.8 ; 4.8 ; 4.6 ; 5 ; 3.8 ; 4.2 ; 3.3 ; 4.7 ; 4.6 ; 4.6 ; 4 ; 4.2 ; 4.9 ; 4.5 ; 4.8 ; 3.8 ; 4.8 ; 5 ; 5 ; 4.9 ; 4.6 ; 5 ; 4.8 ; 4.9 ; 4.9 ; 3.9 ; 3.9 ; 4.5 ; 4.5 ; 3.3 ; 3.1 ; 2.8 ; 3.1 ; 4.2 ; 3.4 ; 3 ; 3.3 ; 3.6 ; 3.7 ; 3.6 ; 4.3 ; 4.1 ; 4.9 ; 4.8 ; 3.7 ; 3.9 ; 4.5 ; 3.6 ; 4.4 ; 3.4 ; 4.4 ; 4.5 ; 4.5 ; 4.5 ; 4.6 ; 4.1 ; 4.5 ; 3.5 ; 4.4 ; 4.4 ; 4.1"
evBty <- "5 ; 5 ; 5 ; 5 ; 3 ; 3 ; 3 ; 3.3 ; 3.3 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 7.3 ; 7.3 ; 7.3 ; 7.3 ; 7.3 ; 7.3 ; 5.5 ; 5.5 ; 5.5 ; 5.5 ; 5.5 ; 5.5 ; 5.5 ; 4.2 ; 4.2 ; 4.2 ; 4.2 ; 4.2 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4.7 ; 4.7 ; 4.7 ; 4.7 ; 4.7 ; 4.7 ; 4.7 ; 5.5 ; 5.5 ; 5.5 ; 5.5 ; 5.5 ; 5.5 ; 5.5 ; 5.5 ; 5.5 ; 5.5 ; 4.8 ; 4.8 ; 4.8 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.8 ; 4.8 ; 4.8 ; 4.8 ; 4.8 ; 4.8 ; 4.8 ; 4 ; 4 ; 4 ; 4 ; 5.5 ; 5.5 ; 5.5 ; 5.5 ; 4.2 ; 4.2 ; 4.2 ; 4.2 ; 4.2 ; 4.2 ; 2.5 ; 2.5 ; 2.5 ; 2.5 ; 2.5 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.8 ; 4.8 ; 4.8 ; 4.8 ; 4.8 ; 4.8 ; 2.8 ; 3 ; 3 ; 3 ; 3 ; 3 ; 4.2 ; 4.2 ; 4.2 ; 4.2 ; 4.2 ; 4.2 ; 4.2 ; 7.8 ; 7.8 ; 3.8 ; 3.8 ; 3.8 ; 3.8 ; 3.8 ; 4.8 ; 4.8 ; 4.8 ; 4.8 ; 4.8 ; 4.8 ; 4.8 ; 3 ; 3 ; 3 ; 3 ; 3 ; 3 ; 3 ; 3 ; 5.2 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 2.7 ; 2.7 ; 2.7 ; 5.5 ; 5.5 ; 5.5 ; 5.5 ; 5.5 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 2.3 ; 2.3 ; 2.3 ; 6.5 ; 6.5 ; 6.5 ; 6.5 ; 2.3 ; 2.3 ; 2.3 ; 2.3 ; 2.3 ; 2.3 ; 2.3 ; 2.3 ; 3 ; 3 ; 3 ; 3.7 ; 3.7 ; 3.7 ; 3.7 ; 3.7 ; 3.7 ; 3.7 ; 3.7 ; 6.2 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4.8 ; 4.8 ; 4.8 ; 4.8 ; 8.2 ; 8.2 ; 8.2 ; 8.2 ; 6.5 ; 6.5 ; 6.5 ; 4.8 ; 4.8 ; 4.8 ; 4.8 ; 7 ; 7 ; 7 ; 4.7 ; 3.8 ; 3.8 ; 3.8 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 3.2 ; 5.8 ; 5.8 ; 5.8 ; 5.8 ; 5.8 ; 5.8 ; 5.7 ; 5.7 ; 5.7 ; 5.7 ; 6.5 ; 6.5 ; 6.5 ; 6.5 ; 6.5 ; 6.5 ; 6.5 ; 1.7 ; 1.7 ; 1.7 ; 1.7 ; 1.7 ; 1.7 ; 6.7 ; 6.7 ; 6.7 ; 3.7 ; 3.7 ; 3.7 ; 3.8 ; 3.8 ; 6.2 ; 6.2 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 3.7 ; 3.7 ; 3.5 ; 3.5 ; 3.5 ; 2.7 ; 5.7 ; 6 ; 6 ; 6.5 ; 6.5 ; 6.5 ; 2.3 ; 2.3 ; 2.3 ; 2.3 ; 2.3 ; 2.3 ; 2.3 ; 2.3 ; 2.3 ; 2.3 ; 2.3 ; 2.3 ; 2.3 ; 7.2 ; 7.2 ; 1.7 ; 1.7 ; 1.7 ; 5.2 ; 3.5 ; 3.5 ; 3.5 ; 3.5 ; 3.5 ; 3.5 ; 3.5 ; 3.5 ; 
3.5 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 5.8 ; 5.8 ; 5.8 ; 5.8 ; 5.8 ; 5.8 ; 6.2 ; 6.2 ; 6.2 ; 6.2 ; 6.2 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 5.2 ; 5.2 ; 4.2 ; 4.2 ; 2.5 ; 2.5 ; 2.5 ; 2.5 ; 2.5 ; 2.5 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 3 ; 3 ; 3 ; 6.3 ; 6.3 ; 6.3 ; 6.3 ; 3.3 ; 3.3 ; 3.3 ; 3.3 ; 2.8 ; 2.8 ; 2.8 ; 2.8 ; 2.8 ; 2.8 ; 2.8 ; 2.8 ; 2.8 ; 2.8 ; 2.8 ; 6.7 ; 6.7 ; 6.7 ; 6.7 ; 6.7 ; 6.8 ; 6.8 ; 6.8 ; 6.8 ; 6.8 ; 7.8 ; 7.8 ; 7.8 ; 7.8 ; 7.8 ; 7.8 ; 7.8 ; 7.8 ; 7.8 ; 7.8 ; 7.8 ; 5.8 ; 5.8 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 7.8 ; 7.8 ; 7.8 ; 3.3 ; 3.3 ; 4.5 ; 4.5 ; 4.5 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 4.3 ; 6.8 ; 6.8 ; 6.8 ; 6.8 ; 6.8 ; 6.8 ; 5.3 ; 5.3 ; 5.3 ; 5.3"
# Assemble the evals data frame from the raw semicolon-delimited strings
# defined above (463 course sections: evaluation score, class size,
# instructor's average beauty rating).
evals <- data.frame(score=as.numeric(strsplit(evScore, " ; ")[[1]]),
cls_students=as.integer(strsplit(evStudents, " ; ")[[1]]),
bty_avg=as.numeric(strsplit(evBty, " ; ")[[1]])
)
# Inspect evals
glimpse(evals)
## Observations: 463
## Variables: 3
## $ score <dbl> 4.7, 4.1, 3.9, 4.8, 4.6, 4.3, 2.8, 4.1, 3.4, 4.5,...
## $ cls_students <int> 43, 125, 125, 123, 20, 40, 44, 55, 195, 46, 27, 2...
## $ bty_avg <dbl> 5.0, 5.0, 5.0, 5.0, 3.0, 3.0, 3.0, 3.3, 3.3, 3.2,...
# Inspect variable types
# (Same glimpse() call repeated in the original course transcript.)
glimpse(evals)
## Observations: 463
## Variables: 3
## $ score <dbl> 4.7, 4.1, 3.9, 4.8, 4.6, 4.3, 2.8, 4.1, 3.4, 4.5,...
## $ cls_students <int> 43, 125, 125, 123, 20, 40, 44, 55, 195, 46, 27, 2...
## $ bty_avg <dbl> 5.0, 5.0, 5.0, 5.0, 3.0, 3.0, 3.0, 3.3, 3.3, 3.2,...
# Remove non-factor variables from this vector
# NOTE(review): these columns come from the full course dataset; none of
# them exist in the 3-column evals reconstructed above -- kept as-is from
# the original exercise.
cat_vars <- c("rank", "ethnicity", "gender", "language",
"cls_level", "cls_profs", "cls_credits",
"pic_outfit", "pic_color")
# Recode cls_students as cls_type: evals
# Discretize class size: <= 18 -> "small", >= 60 -> "large",
# everything in between -> "midsize".
evals <- evals %>%
# Create new variable
mutate(cls_type = ifelse(cls_students <= 18, "small",
ifelse(cls_students >= 60, "large", "midsize")
)
)
# Evaluation score against average beauty rating.
p_score_bty <- ggplot(data = evals,
                      mapping = aes(x = bty_avg, y = score)) +
  geom_point()
p_score_bty
# Same scatterplot, with points colored by class-size category.
p_score_bty_cls <- ggplot(data = evals,
                          mapping = aes(x = bty_avg, y = score,
                                        color = cls_type)) +
  geom_point()
p_score_bty_cls
Chapter 1 - Exploring categorical data
Exploring categorical data; based on a comic book dataset of DC vs Marvel:
Counts vs proportions - the proportions are often much more meaningful:
Distribution of one variable - the typical way to begin exploring a dataset:
Example code includes:
## ISSUE - do not have (and cannot find) this tibble
# Reconstruct the comics data from a 4 x 5 table of cell counts:
# counts are laid out gender-major -- for each of the 4 gender values
# (Female, Male, Other, NA) the 5 alignment counts
# (Bad, Good, Neutral, Reformed Criminals, NA) in order.
comCounts <- c(1573, 2490, 836, 1, 904, 7561, 4809, 1799, 2,
2250, 32, 17, 17, 0, 2, 449, 152, 121, 0, 257
)
# gender: each=5 repeats each gender value over its 5 alignment slots ...
comGender <- rep(rep(c("Female", "Male", "Other", NA), each=5),
times=comCounts
)
# ... and align: times=4 cycles the 5 alignment values once per gender,
# so the two vectors line up cell-for-cell with comCounts.
comAlign <- rep(rep(c("Bad", "Good", "Neutral", "Reformed Criminals", NA), times=4),
times=comCounts
)
comics <- tibble::as_tibble(data.frame(gender=factor(comGender),
align=factor(comAlign)
)
)
# Print the first rows of the data
comics
## # A tibble: 23,272 × 2
## gender align
## <fctr> <fctr>
## 1 Female Bad
## 2 Female Bad
## 3 Female Bad
## 4 Female Bad
## 5 Female Bad
## 6 Female Bad
## 7 Female Bad
## 8 Female Bad
## 9 Female Bad
## 10 Female Bad
## # ... with 23,262 more rows
# Check levels of align
levels(comics$align)
## [1] "Bad" "Good" "Neutral"
## [4] "Reformed Criminals"
# Check the levels of gender
levels(comics$gender)
## [1] "Female" "Male" "Other"
# Create a 2-way contingency table
# The counts reproduce comCounts (NA rows/columns are dropped by table()).
table(comics$align, comics$gender)
##
## Female Male Other
## Bad 1573 7561 32
## Good 2490 4809 17
## Neutral 836 1799 17
## Reformed Criminals 1 2 0
# Drop the sparsely-populated "Reformed Criminals" alignment (3 characters)
# and remove the now-unused factor level.
comics <- droplevels(filter(comics, align != "Reformed Criminals"))
# Side-by-side barchart: gender within each alignment.
ggplot(data = comics, mapping = aes(x = align, fill = gender)) +
  geom_bar(position = "dodge")
# Side-by-side barchart: alignment within each gender, with rotated x labels.
ggplot(data = comics, mapping = aes(x = gender, fill = align)) +
  geom_bar(position = "dodge") +
  theme(axis.text.x = element_text(angle = 90))
# Stacked counts of gender by alignment.
ggplot(data = comics, mapping = aes(x = align, fill = gender)) +
  geom_bar()
# Stacked PROPORTIONS: gender conditional on alignment.
ggplot(data = comics, mapping = aes(x = align, fill = gender)) +
  geom_bar(position = "fill")
# Reorder alignment levels so bars run Bad -> Neutral -> Good.
comics$align <- factor(comics$align,
                       levels = c("Bad", "Neutral", "Good"))
# Marginal distribution of alignment.
ggplot(data = comics, mapping = aes(x = align)) +
  geom_bar()
# Alignment distribution, one panel per gender.
ggplot(data = comics, mapping = aes(x = align)) +
  geom_bar() +
  facet_wrap(~ gender)
# Raw data: one entry per pie, semicolon-delimited (7 flavors, 98 pies).
pieFlavor <- "cherry ; cherry ; cherry ; cherry ; cherry ; cherry ; cherry ; cherry ; cherry ; cherry ; cherry ; cherry ; cherry ; key lime ; key lime ; key lime ; key lime ; key lime ; key lime ; key lime ; key lime ; key lime ; key lime ; key lime ; key lime ; key lime ; key lime ; key lime ; key lime ; boston creme ; boston creme ; boston creme ; boston creme ; boston creme ; boston creme ; boston creme ; boston creme ; boston creme ; boston creme ; boston creme ; boston creme ; boston creme ; boston creme ; boston creme ; strawberry ; strawberry ; strawberry ; strawberry ; strawberry ; strawberry ; strawberry ; strawberry ; strawberry ; strawberry ; strawberry ; blueberry ; blueberry ; blueberry ; blueberry ; blueberry ; blueberry ; blueberry ; blueberry ; blueberry ; blueberry ; blueberry ; blueberry ; blueberry ; blueberry ; apple ; apple ; apple ; apple ; apple ; apple ; apple ; apple ; apple ; apple ; apple ; apple ; apple ; apple ; apple ; apple ; apple ; pumpkin ; pumpkin ; pumpkin ; pumpkin ; pumpkin ; pumpkin ; pumpkin ; pumpkin ; pumpkin ; pumpkin ; pumpkin ; pumpkin"
pies <- data.frame(flavor=factor(strsplit(pieFlavor, " ; ")[[1]]))
# Garden variety pie chart
# ggplot2 has no pie geom: a pie chart is a single stacked bar ("fill")
# wrapped into polar coordinates on the y (angle) axis.
ggplot(pies, aes(x=factor(1), fill=flavor)) +
geom_bar(position = "fill") +
coord_polar(theta="y") +
labs(x='', y='')
# Put levels of flavor in descending order
lev <- c("apple", "key lime", "boston creme", "blueberry", "cherry", "pumpkin", "strawberry")
pies$flavor <- factor(pies$flavor, levels = lev)
# Create barchart of flavor
# The bar chart makes the count comparison easier to read than the pie.
ggplot(pies, aes(x = flavor)) +
geom_bar(fill = "chartreuse") +
theme(axis.text.x = element_text(angle = 90))
# If you prefer that it still be multi-colored like the pie
ggplot(pies, aes(x = flavor)) +
geom_bar(aes(fill=flavor)) +
theme(axis.text.x = element_text(angle = 90))
Chapter 2 - Exploring numerical data
Exploring numerical data - cars that were available for sale in a given year (428 x 19 tbl_df):
Distribution of one variable:
Box plots are based around three characteristics of the data:
Visualization in higher dimensions:
Example code includes:
# Time to create some data . . .
carCityMPG <- "28 ; 28 ; 26 ; 26 ; 26 ; 29 ; 29 ; 26 ; 27 ; 26 ; 26 ; 32 ; 36 ; 32 ; 29 ; 29 ; 29 ; 26 ; 26 ; 26 ; 23 ; 26 ; 25 ; 24 ; 24 ; 24 ; NA ; 28 ; NA ; NA ; 28 ; 28 ; 24 ; 26 ; 26 ; 26 ; 26 ; 26 ; 32 ; 25 ; 25 ; 24 ; 22 ; 32 ; 32 ; 32 ; 35 ; 33 ; 35 ; 20 ; 21 ; 24 ; 22 ; 21 ; 22 ; 22 ; 22 ; 21 ; 21 ; 21 ; 21 ; 21 ; 20 ; 19 ; 26 ; 26 ; 32 ; 26 ; 46 ; 60 ; 19 ; 19 ; 20 ; NA ; 24 ; 20 ; 25 ; NA ; NA ; 21 ; 23 ; 24 ; 20 ; 20 ; 24 ; 20 ; 22 ; 21 ; 20 ; 24 ; 21 ; 24 ; 20 ; 59 ; 24 ; 24 ; 38 ; 24 ; 24 ; 22 ; 22 ; 20 ; 20 ; 20 ; 18 ; 20 ; 18 ; 23 ; 18 ; 18 ; 21 ; 19 ; 21 ; 22 ; 18 ; 17 ; 17 ; 21 ; 21 ; 17 ; 17 ; 18 ; 18 ; 18 ; 17 ; 22 ; 19 ; 17 ; 17 ; 19 ; 18 ; 18 ; 21 ; 20 ; 20 ; 20 ; 20 ; 21 ; 20 ; 19 ; 21 ; 21 ; 20 ; 21 ; 24 ; 22 ; 22 ; 20 ; 23 ; 20 ; 17 ; 18 ; 20 ; 18 ; 20 ; 19 ; 19 ; 20 ; 20 ; 20 ; 19 ; 20 ; 20 ; 18 ; 18 ; 21 ; 17 ; 18 ; 19 ; 18 ; 20 ; 18 ; 18 ; 20 ; 20 ; 20 ; 19 ; 19 ; 20 ; 19 ; 17 ; 17 ; NA ; 20 ; 20 ; 21 ; 21 ; 19 ; 21 ; 19 ; 18 ; 20 ; 20 ; 18 ; 20 ; 20 ; 18 ; 18 ; 20 ; 18 ; 18 ; 17 ; 17 ; 14 ; 19 ; 20 ; 18 ; 18 ; 18 ; 18 ; 18 ; 18 ; 18 ; 17 ; 17 ; 18 ; 18 ; 17 ; 18 ; 18 ; 17 ; 18 ; 18 ; 18 ; 17 ; 17 ; 17 ; 17 ; 17 ; 16 ; 16 ; 13 ; 20 ; 17 ; 19 ; 16 ; 18 ; 16 ; 21 ; 21 ; NA ; NA ; 21 ; 20 ; 19 ; 17 ; 15 ; 20 ; 20 ; 21 ; 16 ; 16 ; 20 ; 21 ; 17 ; 18 ; 18 ; 17 ; NA ; 20 ; 17 ; 17 ; 20 ; 19 ; 18 ; 18 ; 16 ; 16 ; 18 ; 23 ; 23 ; 18 ; 18 ; 16 ; 14 ; 13 ; 21 ; 17 ; 21 ; 21 ; 18 ; 20 ; 20 ; NA ; 18 ; 17 ; 18 ; 17 ; 20 ; 18 ; 20 ; 18 ; 24 ; 26 ; 14 ; 16 ; 14 ; 14 ; 15 ; NA ; 15 ; 15 ; 16 ; 13 ; 10 ; 15 ; 13 ; 13 ; 14 ; 17 ; 16 ; 16 ; 15 ; 19 ; 16 ; 15 ; 17 ; 17 ; 16 ; 16 ; 12 ; 15 ; 13 ; 18 ; 13 ; 13 ; 14 ; 16 ; 17 ; 15 ; 16 ; 19 ; 14 ; 21 ; 18 ; 18 ; 18 ; 13 ; 15 ; 15 ; 19 ; 18 ; 21 ; 21 ; 20 ; 20 ; 16 ; 12 ; 18 ; 22 ; 21 ; 17 ; 19 ; 22 ; 18 ; 15 ; 19 ; 22 ; 17 ; 26 ; 19 ; 16 ; 15 ; 26 ; 18 ; 19 ; 19 ; 16 ; 19 ; NA ; 20 ; 29 ; 19 ; 24 ; 31 ; 21 ; 21 ; 24 ; 29 ; 24 ; 22 ; 18 ; 22 ; 20 ; 14 ; 19 ; 19 ; 18 ; 20 ; 18 ; 17 ; 16 ; 18 ; 18 ; 16 ; 18 ; 16 ; 
19 ; 18 ; 19 ; 19 ; 18 ; 19 ; 19 ; 13 ; 14 ; 18 ; 15 ; 13 ; 16 ; 16 ; 16 ; 16 ; 15 ; 14 ; 24 ; 19 ; 17 ; NA ; 15 ; 24 ; 15 ; 17 ; 14 ; 21 ; 22 ; 16 ; 14"
carSUV <- "0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0 ; 0"
carNCyl <- "4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 6 ; 6 ; 4 ; 6 ; 6 ; 4 ; 4 ; 4 ; 6 ; 6 ; 4 ; 4 ; 4 ; 6 ; 6 ; 4 ; 4 ; 4 ; 4 ; 4 ; 3 ; 6 ; 6 ; 6 ; 4 ; 4 ; 6 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 6 ; 6 ; 4 ; 6 ; 4 ; 4 ; 6 ; 4 ; 6 ; 4 ; 6 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 4 ; 4 ; 6 ; 8 ; 8 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 4 ; 6 ; 8 ; 8 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 4 ; 4 ; 6 ; 6 ; 6 ; 6 ; 6 ; 4 ; 4 ; 4 ; 6 ; 4 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 8 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 8 ; 8 ; 8 ; 4 ; 4 ; 4 ; 4 ; 6 ; 6 ; 6 ; 8 ; 5 ; 5 ; 5 ; 6 ; 5 ; 6 ; 6 ; 6 ; 6 ; 6 ; 8 ; 8 ; 8 ; 6 ; 6 ; 8 ; 8 ; 8 ; 6 ; 8 ; 8 ; 8 ; 8 ; 8 ; 6 ; 8 ; 8 ; 8 ; 8 ; 8 ; 6 ; 8 ; 8 ; 8 ; 8 ; 8 ; 8 ; 8 ; 6 ; 8 ; 12 ; 6 ; 8 ; 6 ; 8 ; 8 ; 8 ; 4 ; 4 ; 8 ; 12 ; 5 ; 5 ; 6 ; 6 ; 8 ; 4 ; 4 ; 6 ; 6 ; 6 ; 6 ; 6 ; 8 ; 8 ; 8 ; 6 ; 10 ; 6 ; 8 ; 8 ; 4 ; 6 ; 8 ; 8 ; 8 ; 8 ; 8 ; 4 ; 4 ; -1 ; -1 ; 8 ; 8 ; 12 ; 4 ; 6 ; 6 ; 6 ; 4 ; 6 ; 6 ; 8 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 4 ; 4 ; 4 ; 4 ; 8 ; 8 ; 8 ; 8 ; 8 ; 10 ; 8 ; 6 ; 8 ; 8 ; 8 ; 6 ; 8 ; 8 ; 8 ; 6 ; 6 ; 8 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 8 ; 8 ; 8 ; 6 ; 8 ; 8 ; 8 ; 6 ; 6 ; 6 ; 6 ; 6 ; 8 ; 4 ; 6 ; 6 ; 6 ; 8 ; 6 ; 6 ; 6 ; 6 ; 4 ; 4 ; 6 ; 4 ; 6 ; 8 ; 6 ; 4 ; 4 ; 6 ; 6 ; 4 ; 6 ; 8 ; 6 ; 6 ; 6 ; 4 ; 6 ; 6 ; 8 ; 4 ; 6 ; 6 ; 6 ; 8 ; 6 ; 4 ; 6 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 4 ; 8 ; 4 ; 5 ; 6 ; 6 ; 6 ; 6 ; 4 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 6 ; 8 ; 8 ; 4 ; 6 ; 8 ; 8 ; 6 ; 6 ; 6 ; 8 ; 8 ; 4 ; 4 ; 8 ; 8 ; 6 ; 4 ; 6 ; 6 ; 8 ; 4 ; 4 ; 6 ; 6"
carHP <- "103 ; 103 ; 140 ; 140 ; 140 ; 132 ; 132 ; 130 ; 110 ; 130 ; 130 ; 115 ; 117 ; 115 ; 103 ; 103 ; 103 ; 138 ; 138 ; 138 ; 138 ; 104 ; 104 ; 124 ; 124 ; 124 ; 148 ; 115 ; 120 ; 120 ; 126 ; 126 ; 140 ; 140 ; 140 ; 140 ; 140 ; 140 ; 108 ; 155 ; 155 ; 119 ; 119 ; 130 ; 130 ; 130 ; 108 ; 108 ; 108 ; 175 ; 180 ; 145 ; 200 ; 180 ; 150 ; 150 ; 150 ; 200 ; 200 ; 150 ; 150 ; 170 ; 155 ; 201 ; 160 ; 160 ; 127 ; 160 ; 93 ; 73 ; 170 ; 170 ; 170 ; 160 ; 160 ; 155 ; 163 ; 160 ; 120 ; 175 ; 165 ; 140 ; 175 ; 200 ; 140 ; 182 ; 165 ; 165 ; 155 ; 157 ; 210 ; 157 ; 225 ; 110 ; 115 ; 180 ; 100 ; 150 ; 200 ; 200 ; 170 ; 184 ; 205 ; 200 ; 240 ; 200 ; 240 ; 200 ; 200 ; 250 ; 200 ; 232 ; 220 ; 150 ; 232 ; 224 ; 224 ; 240 ; 240 ; 194 ; 194 ; 260 ; 280 ; 192 ; 195 ; 189 ; 215 ; 224 ; 224 ; 201 ; 205 ; 230 ; 245 ; 265 ; 265 ; 170 ; 200 ; 165 ; 165 ; 212 ; 210 ; 210 ; 225 ; 200 ; 115 ; 170 ; 170 ; 270 ; 170 ; 220 ; 220 ; 220 ; 220 ; 220 ; 184 ; 184 ; 184 ; 225 ; 225 ; 225 ; 184 ; 205 ; 205 ; 255 ; 255 ; 200 ; 239 ; 260 ; 255 ; 227 ; 225 ; 215 ; 215 ; 232 ; 232 ; 168 ; 168 ; 215 ; 215 ; 215 ; 224 ; 302 ; 275 ; 210 ; 210 ; 220 ; 250 ; 212 ; 210 ; 190 ; 270 ; 208 ; 247 ; 300 ; 208 ; 194 ; 225 ; 225 ; 220 ; 220 ; 250 ; 300 ; 330 ; 340 ; 225 ; 225 ; 325 ; 325 ; 325 ; 240 ; 275 ; 300 ; 275 ; 340 ; 340 ; 235 ; 294 ; 390 ; 294 ; 294 ; 390 ; 220 ; 300 ; 290 ; 280 ; 280 ; 239 ; 239 ; 239 ; 349 ; 302 ; 493 ; 215 ; 302 ; 221 ; 302 ; 275 ; 302 ; 210 ; 210 ; 335 ; 420 ; 197 ; 242 ; 268 ; 290 ; 450 ; 180 ; 225 ; 250 ; 333 ; 333 ; 184 ; 225 ; 320 ; 350 ; 350 ; 215 ; 500 ; 193 ; 260 ; 280 ; 240 ; 172 ; 294 ; 294 ; 390 ; 390 ; 300 ; 142 ; 142 ; 197 ; 238 ; 302 ; 493 ; 493 ; 192 ; 349 ; 210 ; 210 ; 271 ; 287 ; 287 ; 340 ; 315 ; 315 ; 315 ; 477 ; 228 ; 258 ; 227 ; 300 ; 180 ; 138 ; 295 ; 320 ; 295 ; 295 ; 230 ; 310 ; 232 ; 275 ; 285 ; 325 ; 316 ; 275 ; 300 ; 305 ; 240 ; 265 ; 225 ; 325 ; 275 ; 185 ; 275 ; 210 ; 240 ; 193 ; 195 ; 192 ; 282 ; 235 ; 235 ; 230 ; 302 ; 292 ; 288 ; 210 ; 215 ; 215 ; 240 ; 185 ; 
340 ; 143 ; 185 ; 245 ; 230 ; 325 ; 220 ; 268 ; 165 ; 201 ; 160 ; 160 ; 173 ; 150 ; 190 ; 217 ; 174 ; 130 ; 160 ; 180 ; 165 ; 161 ; 220 ; 340 ; 184 ; 200 ; 250 ; 130 ; 155 ; 280 ; 315 ; 104 ; 215 ; 168 ; 221 ; 302 ; 155 ; 160 ; 245 ; 130 ; 250 ; 140 ; 108 ; 165 ; 165 ; 155 ; 130 ; 115 ; 170 ; 270 ; 170 ; 208 ; 190 ; 185 ; 180 ; 215 ; 150 ; 215 ; 193 ; 190 ; 240 ; 240 ; 195 ; 200 ; 201 ; 240 ; 240 ; 185 ; 185 ; 185 ; 230 ; 230 ; 345 ; 295 ; 175 ; 200 ; 300 ; 300 ; 210 ; 210 ; 215 ; 231 ; 300 ; 143 ; 175 ; 285 ; 300 ; 190 ; 143 ; 207 ; 180 ; 305 ; 165 ; 142 ; 190 ; 190"
carMSRP <- "11690 ; 12585 ; 14610 ; 14810 ; 16385 ; 13670 ; 15040 ; 13270 ; 13730 ; 15460 ; 15580 ; 13270 ; 14170 ; 15850 ; 10539 ; 11839 ; 11939 ; 13839 ; 15389 ; 15389 ; 16040 ; 10280 ; 11155 ; 12360 ; 13580 ; 14630 ; 15500 ; 16999 ; 14622 ; 16722 ; 12740 ; 14740 ; 15495 ; 10995 ; 14300 ; 15825 ; 14850 ; 16350 ; 12965 ; 12884 ; 14500 ; 12269 ; 15568 ; 14085 ; 15030 ; 15295 ; 10760 ; 11560 ; 11290 ; 22180 ; 21900 ; 18995 ; 20370 ; 21825 ; 17985 ; 22000 ; 19090 ; 21840 ; 22035 ; 18820 ; 20220 ; 19135 ; 20320 ; 22735 ; 19860 ; 22260 ; 17750 ; 19490 ; 20140 ; 19110 ; 19339 ; 20339 ; 18435 ; 17200 ; 19270 ; 21595 ; 19999 ; 19312 ; 17232 ; 19240 ; 17640 ; 18825 ; 22450 ; 22395 ; 17735 ; 21410 ; 19945 ; 20445 ; 17262 ; 19560 ; 22775 ; 19635 ; 21965 ; 20510 ; 18715 ; 19825 ; 21055 ; 21055 ; 23820 ; 26990 ; 25940 ; 28495 ; 26470 ; 24895 ; 28345 ; 25000 ; 27995 ; 23495 ; 24225 ; 29865 ; 24130 ; 26860 ; 25955 ; 25215 ; 24885 ; 24345 ; 27370 ; 23760 ; 26960 ; 24589 ; 26189 ; 28495 ; 29795 ; 29995 ; 26000 ; 26060 ; 28370 ; 24695 ; 29595 ; 23895 ; 29282 ; 25700 ; 23290 ; 27490 ; 29440 ; 23675 ; 24295 ; 25645 ; 27145 ; 29345 ; 26560 ; 25920 ; 26510 ; 23785 ; 23215 ; 23955 ; 25135 ; 33195 ; 35940 ; 31840 ; 33430 ; 34480 ; 36640 ; 39640 ; 30795 ; 37995 ; 30245 ; 35495 ; 36995 ; 37245 ; 39995 ; 32245 ; 35545 ; 30835 ; 33295 ; 30950 ; 30315 ; 32445 ; 31145 ; 33995 ; 32350 ; 31045 ; 32415 ; 32495 ; 36895 ; 32280 ; 33480 ; 35920 ; 37630 ; 38830 ; 30895 ; 34495 ; 35995 ; 30860 ; 33360 ; 35105 ; 39465 ; 31545 ; 30920 ; 33180 ; 39235 ; 31745 ; 34845 ; 37560 ; 37730 ; 37885 ; 43755 ; 46100 ; 42490 ; 44240 ; 42840 ; 49690 ; 69190 ; 48040 ; 44295 ; 44995 ; 54995 ; 69195 ; 73195 ; 40720 ; 45445 ; 50595 ; 47955 ; 42845 ; 52545 ; 43895 ; 49995 ; 63120 ; 68995 ; 59995 ; 74995 ; 41010 ; 48450 ; 55750 ; 40095 ; 43495 ; 41815 ; 44925 ; 50470 ; 52120 ; 94820 ; 128420 ; 45707 ; 52800 ; 48170 ; 57270 ; 74320 ; 86970 ; 40670 ; 43175 ; 65000 ; 75000 ; 40565 ; 42565 ; 45210 ; 89765 ; 84600 ; 35940 ; 
37390 ; 40590 ; 48195 ; 56595 ; 33895 ; 41045 ; 76200 ; 44535 ; 51535 ; 34495 ; 81795 ; 18345 ; 29380 ; 37530 ; 33260 ; 18739 ; 69995 ; 74995 ; 81995 ; 86995 ; 63200 ; 22388 ; 25193 ; 25700 ; 27200 ; 90520 ; 121770 ; 126670 ; 40320 ; 56170 ; 25092 ; 26992 ; 29562 ; 26910 ; 34390 ; 33500 ; 79165 ; 84165 ; 76765 ; 192465 ; 43365 ; 52365 ; 25045 ; 31545 ; 22570 ; 25130 ; 52795 ; 46995 ; 42735 ; 41465 ; 32235 ; 41475 ; 34560 ; 31890 ; 35725 ; 46265 ; 49995 ; 31849 ; 52775 ; 33840 ; 35695 ; 36945 ; 37000 ; 52195 ; 37895 ; 26545 ; 30295 ; 29670 ; 27560 ; 20449 ; 27905 ; 19635 ; 72250 ; 45700 ; 64800 ; 39195 ; 42915 ; 76870 ; 46470 ; 29995 ; 30492 ; 33112 ; 27339 ; 21595 ; 56665 ; 20585 ; 23699 ; 27710 ; 27930 ; 54765 ; 35515 ; 41250 ; 20255 ; 22515 ; 19860 ; 18690 ; 21589 ; 20130 ; 25520 ; 39250 ; 25995 ; 21087 ; 18892 ; 20939 ; 17163 ; 20290 ; 40840 ; 49090 ; 32845 ; 22225 ; 31230 ; 17475 ; 22290 ; 34895 ; 36395 ; 11905 ; 32455 ; 33780 ; 50670 ; 60670 ; 22595 ; 17495 ; 28739 ; 17045 ; 40845 ; 23560 ; 14165 ; 21445 ; 23895 ; 16497 ; 16695 ; 19005 ; 24955 ; 40235 ; 26135 ; 35145 ; 26395 ; 27020 ; 27490 ; 38380 ; 21795 ; 32660 ; 26930 ; 25640 ; 24950 ; 27450 ; 20615 ; 28750 ; 33995 ; 24780 ; 32780 ; 28790 ; 23845 ; 31370 ; 23495 ; 28800 ; 52975 ; 36100 ; 18760 ; 20310 ; 40340 ; 41995 ; 17630 ; 20300 ; 20215 ; 22010 ; 33540 ; 14385 ; 16530 ; 25717 ; 29322 ; 25395 ; 14840 ; 22350 ; 19479 ; 26650 ; 24520 ; 12800 ; 16495 ; 25935"
carWidth <- "66 ; 66 ; 69 ; 68 ; 69 ; 67 ; 67 ; 67 ; 67 ; 67 ; 67 ; 67 ; 67 ; 68 ; 66 ; 66 ; 66 ; 68 ; 68 ; 68 ; 72 ; 66 ; 66 ; 68 ; 68 ; 68 ; NA ; 67 ; 67 ; 67 ; 67 ; 67 ; 68 ; 67 ; 67 ; 67 ; 68 ; 68 ; 67 ; 68 ; 68 ; 68 ; 68 ; 67 ; 67 ; 67 ; 65 ; 65 ; 65 ; 73 ; 73 ; 70 ; 70 ; 73 ; 67 ; 67 ; 71 ; 71 ; 75 ; 71 ; 71 ; 67 ; 73 ; 73 ; 71 ; 71 ; 68 ; 67 ; 68 ; 67 ; 72 ; 72 ; 72 ; NA ; 70 ; 73 ; 67 ; 72 ; 67 ; 70 ; 67 ; 70 ; 70 ; 74 ; 68 ; 69 ; 69 ; 69 ; 72 ; 71 ; 71 ; 72 ; 72 ; 68 ; 68 ; 68 ; 68 ; 68 ; 68 ; 69 ; 70 ; 69 ; 74 ; 73 ; 73 ; 73 ; 73 ; 70 ; 73 ; 74 ; 74 ; 74 ; 67 ; 64 ; 75 ; 78 ; 78 ; 72 ; 71 ; 72 ; 72 ; 69 ; 72 ; 70 ; 73 ; 68 ; 68 ; 78 ; 78 ; 73 ; 70 ; 72 ; 70 ; 72 ; 72 ; 70 ; 74 ; 69 ; 69 ; 69 ; 72 ; 71 ; 72 ; 68 ; 68 ; 69 ; 68 ; 72 ; 70 ; 70 ; 70 ; 70 ; 71 ; 71 ; 69 ; 69 ; 69 ; 69 ; 69 ; 69 ; 73 ; 74 ; 75 ; 71 ; 74 ; 69 ; 78 ; 69 ; 70 ; 70 ; 71 ; 68 ; 68 ; 73 ; 73 ; 68 ; 68 ; 68 ; 68 ; 68 ; 78 ; 78 ; 74 ; 69 ; 69 ; 71 ; 71 ; 69 ; 72 ; 69 ; 69 ; 71 ; 71 ; 71 ; 72 ; 72 ; 72 ; 72 ; 70 ; 70 ; 71 ; 71 ; 75 ; 70 ; 69 ; 73 ; 73 ; 75 ; 75 ; 75 ; 74 ; 74 ; 75 ; 70 ; 73 ; 72 ; 72 ; 72 ; 73 ; 73 ; 73 ; 71 ; 71 ; 72 ; 73 ; 73 ; 78 ; 78 ; 78 ; 68 ; 73 ; 73 ; 69 ; 69 ; 71 ; 71 ; 73 ; 73 ; 69 ; 69 ; 75 ; 75 ; 72 ; 72 ; 72 ; 71 ; 78 ; 73 ; 73 ; 73 ; 70 ; 70 ; 70 ; 70 ; 72 ; 74 ; 74 ; 70 ; 75 ; 73 ; 73 ; 72 ; 69 ; 69 ; 71 ; 71 ; 71 ; 71 ; 72 ; 66 ; 66 ; NA ; NA ; 72 ; 72 ; 72 ; 68 ; 68 ; 69 ; 69 ; 70 ; 72 ; 72 ; 73 ; 70 ; 72 ; 70 ; 72 ; 70 ; 70 ; 69 ; 69 ; 68 ; 67 ; 79 ; 73 ; 79 ; 79 ; 76 ; 80 ; 79 ; 75 ; 79 ; 79 ; 81 ; 76 ; 80 ; 79 ; 78 ; 77 ; 73 ; 74 ; 75 ; 74 ; 75 ; 72 ; 77 ; 70 ; 72 ; 73 ; 76 ; 74 ; 76 ; 73 ; 76 ; 71 ; 72 ; 72 ; 74 ; 75 ; 72 ; 74 ; 76 ; 72 ; 70 ; 74 ; 72 ; 76 ; 76 ; 75 ; 67 ; 70 ; 70 ; 72 ; 73 ; 72 ; 67 ; 74 ; 71 ; 72 ; 69 ; 70 ; 67 ; 68 ; 71 ; 70 ; 69 ; 70 ; 79 ; 67 ; 73 ; 76 ; 76 ; 66 ; 68 ; 68 ; 71 ; 71 ; 73 ; 67 ; 74 ; 70 ; 71 ; 69 ; 67 ; 68 ; 69 ; 68 ; 70 ; 68 ; 69 ; 69 ; 68 ; 73 ; 78 ; 72 ; 79 ; 79 ; 79 ; 79 ; 77 ; 78 ; 76 ; 76 ; 75 ; 72 ; 77 ; 
78 ; 78 ; 72 ; 72 ; 72 ; 77 ; 77 ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA ; NA"
carHwyMPG <- as.integer(strsplit("34 ; 34 ; 37 ; 37 ; 37 ; 36 ; 36 ; 33 ; 36 ; 33 ; 33 ; 38 ; 44 ; 38 ; 33 ; 33 ; 33 ; 34 ; 34 ; 34 ; 30 ; 33 ; 32 ; 32 ; 32 ; 32 ; NA ; 37 ; NA ; NA ; 35 ; 35 ; 33 ; 35 ; 35 ; 35 ; 35 ; 35 ; 38 ; 31 ; 31 ; 31 ; 30 ; 40 ; 40 ; 40 ; 43 ; 39 ; 43 ; 30 ; 32 ; 34 ; 30 ; 32 ; 29 ; 29 ; 30 ; 28 ; 29 ; 28 ; 28 ; 28 ; 27 ; 26 ; 34 ; 34 ; 37 ; 30 ; 51 ; 66 ; 27 ; 27 ; 27 ; NA ; 32 ; 27 ; 34 ; NA ; NA ; 26 ; 28 ; 32 ; 29 ; 30 ; 33 ; 28 ; 28 ; 28 ; 27 ; 33 ; 29 ; 33 ; 29 ; 51 ; 31 ; 31 ; 46 ; 31 ; 31 ; 29 ; 31 ; 29 ; 29 ; 30 ; 28 ; 30 ; 28 ; 32 ; 28 ; 27 ; 29 ; 27 ; 27 ; 30 ; 27 ; 25 ; 25 ; 30 ; 30 ; 26 ; 26 ; 26 ; 26 ; 26 ; 25 ; 30 ; 26 ; 25 ; 25 ; 26 ; 25 ; 26 ; 26 ; 28 ; 28 ; 29 ; 30 ; 28 ; 27 ; 26 ; 29 ; 29 ; 29 ; 30 ; 30 ; 31 ; 29 ; 28 ; 30 ; 28 ; 26 ; 25 ; 27 ; 25 ; 29 ; 27 ; 27 ; 30 ; 30 ; 29 ; 28 ; 29 ; 29 ; 25 ; 27 ; 28 ; 25 ; 26 ; 26 ; 25 ; 29 ; 25 ; 24 ; 26 ; 26 ; 25 ; 25 ; 26 ; 26 ; 27 ; 25 ; 23 ; NA ; 28 ; 28 ; 29 ; 29 ; 26 ; 29 ; 26 ; 25 ; 27 ; 28 ; 25 ; 28 ; 27 ; 24 ; 24 ; 27 ; 25 ; 25 ; 24 ; 24 ; 20 ; 28 ; 30 ; 26 ; 26 ; 26 ; 28 ; 26 ; 26 ; 26 ; 23 ; 23 ; 26 ; 28 ; 24 ; 28 ; 28 ; 24 ; 25 ; 23 ; 25 ; 24 ; 24 ; 25 ; 25 ; 25 ; 21 ; 24 ; 19 ; 26 ; 22 ; 27 ; 20 ; 26 ; 24 ; 29 ; 30 ; NA ; NA ; 28 ; 26 ; 26 ; 24 ; 22 ; 28 ; 28 ; 29 ; 24 ; 23 ; 28 ; 29 ; 25 ; 25 ; 25 ; 25 ; NA ; 29 ; 25 ; 24 ; 25 ; 26 ; 26 ; 26 ; 23 ; 23 ; 23 ; 28 ; 28 ; 25 ; 24 ; 23 ; 21 ; 19 ; 29 ; 22 ; 28 ; 28 ; 26 ; 26 ; 26 ; NA ; 26 ; 24 ; 26 ; 24 ; 29 ; 26 ; 27 ; 24 ; 33 ; 32 ; 18 ; 21 ; 18 ; 18 ; 21 ; NA ; 19 ; 19 ; 19 ; 17 ; 12 ; 20 ; 18 ; 19 ; 17 ; 23 ; 23 ; 22 ; 21 ; 26 ; 21 ; 20 ; 22 ; 21 ; 21 ; 19 ; 16 ; 19 ; 17 ; 24 ; 18 ; 14 ; 17 ; 21 ; 21 ; 19 ; 21 ; 26 ; 18 ; 26 ; 22 ; 21 ; 24 ; 17 ; 20 ; 20 ; 22 ; 23 ; 25 ; 24 ; 26 ; 24 ; 19 ; 16 ; 21 ; 25 ; 27 ; 20 ; 22 ; 27 ; 25 ; 21 ; 26 ; 30 ; 23 ; 33 ; 26 ; 22 ; 19 ; 33 ; 24 ; 25 ; 27 ; 24 ; 26 ; NA ; 25 ; 36 ; 29 ; 34 ; 35 ; 28 ; 28 ; 29 ; 36 ; 30 ; 31 ; 25 ; 29 ; 27 ; 17 ; 26 ; 26 ; 25 ; 26 ; 25 ; 23 ; 20 ; 25 ; 
25 ; 22 ; 25 ; 23 ; 26 ; 25 ; 26 ; 26 ; 24 ; 27 ; 27 ; 17 ; 18 ; 23 ; 21 ; 17 ; 19 ; 22 ; 22 ; 21 ; 19 ; 18 ; 29 ; 24 ; 20 ; NA ; 19 ; 29 ; 19 ; 20 ; 18 ; 28 ; 27 ; 20 ; 17", " ; ")[[1]])
## Warning: NAs introduced by coercion
# Build the `cars` data frame (428 obs.) by splitting each " ; "-delimited
# string into a character vector and coercing to the appropriate type.
# carCityMPG, carSUV and carNCyl are defined earlier in the document.
# "NA" tokens fail integer coercion and become NA — hence the
# "NAs introduced by coercion" warnings echoed below.
cars <- data.frame(city_mpg=as.integer(strsplit(carCityMPG, " ; ")[[1]]),
suv=as.logical(as.integer(strsplit(carSUV, " ; ")[[1]])),
ncyl=as.integer(strsplit(carNCyl, " ; ")[[1]]),
horsepwr=as.integer(strsplit(carHP, " ; ")[[1]]),
msrp=as.integer(strsplit(carMSRP, " ; ")[[1]]),
width=as.integer(strsplit(carWidth, " ; ")[[1]]),
hwy_mpg=carHwyMPG
)
## Warning in data.frame(city_mpg = as.integer(strsplit(carCityMPG, " ; ")
## [[1]]), : NAs introduced by coercion
## Warning in data.frame(city_mpg = as.integer(strsplit(carCityMPG, " ; ")
## [[1]]), : NAs introduced by coercion
colSums(is.na(cars))
## city_mpg suv ncyl horsepwr msrp width hwy_mpg
## 14 0 0 0 0 28 14
# Learn data structure
str(cars)
## 'data.frame': 428 obs. of 7 variables:
## $ city_mpg: int 28 28 26 26 26 29 29 26 27 26 ...
## $ suv : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ ncyl : int 4 4 4 4 4 4 4 4 4 4 ...
## $ horsepwr: int 103 103 140 140 140 132 132 130 110 130 ...
## $ msrp : int 11690 12585 14610 14810 16385 13670 15040 13270 13730 15460 ...
## $ width : int 66 66 69 68 69 67 67 67 67 67 ...
## $ hwy_mpg : int 34 34 37 37 37 36 36 33 36 33 ...
# Create faceted histogram
ggplot(cars, aes(x = city_mpg)) +
geom_histogram() +
facet_grid(. ~ suv)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 14 rows containing non-finite values (stat_bin).
# Filter cars with 4, 6, 8 cylinders
common_cyl <- filter(cars, ncyl %in% c(4, 6, 8))
# Create box plots of city mpg by ncyl
ggplot(common_cyl, aes(x = as.factor(ncyl), y = city_mpg)) +
geom_boxplot()
## Warning: Removed 11 rows containing non-finite values (stat_boxplot).
# Create overlaid density plots for same data
ggplot(common_cyl, aes(x = city_mpg, fill = as.factor(ncyl))) +
geom_density(alpha = .3)
## Warning: Removed 11 rows containing non-finite values (stat_density).
# Create hist of horsepwr
cars %>%
ggplot(aes(x=horsepwr)) +
geom_histogram() +
ggtitle("Histogram of Horsepower")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Create hist of horsepwr for affordable cars
cars %>%
filter(msrp < 25000) %>%
ggplot(aes(x=horsepwr)) +
geom_histogram() +
xlim(c(90, 550)) +
ggtitle("Histogram of Horsepower\n(Affordable Cars Only)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (stat_bin).
## Warning: Removed 1 rows containing missing values (geom_bar).
# Create hist of horsepwr with binwidth of 3
cars %>%
ggplot(aes(x=horsepwr)) +
geom_histogram(binwidth = 3) +
ggtitle("Histogram of Horsepower\n(Bucket Size=3)")
# Create hist of horsepwr with binwidth of 30
cars %>%
ggplot(aes(x=horsepwr)) +
geom_histogram(binwidth = 30) +
ggtitle("Histogram of Horsepower\n(Bucket Size=30)")
# Create hist of horsepwr with binwidth of 60
cars %>%
ggplot(aes(x=horsepwr)) +
geom_histogram(binwidth = 60) +
ggtitle("Histogram of Horsepower\n(Bucket Size=60)")
# Construct box plot of msrp
cars %>%
ggplot(aes(x = 1, y = msrp)) +
geom_boxplot()
# Exclude outliers from data
cars_no_out <- cars %>%
filter(msrp < 100000)
# Create plot of city_mpg
cars %>%
ggplot(aes(x=city_mpg)) +
geom_density()
## Warning: Removed 14 rows containing non-finite values (stat_density).
# Create plot of width
cars %>%
ggplot(aes(x=width)) +
geom_density()
## Warning: Removed 28 rows containing non-finite values (stat_density).
# Create plot of city_mpg
cars %>%
ggplot(aes(x=factor(1), y=city_mpg)) +
geom_boxplot()
## Warning: Removed 14 rows containing non-finite values (stat_boxplot).
# Create plot of width
cars %>%
ggplot(aes(x=factor(1), y=width)) +
geom_boxplot()
## Warning: Removed 28 rows containing non-finite values (stat_boxplot).
# Facet hists using hwy mileage and ncyl
common_cyl %>%
ggplot(aes(x = hwy_mpg)) +
geom_histogram() +
facet_grid(ncyl ~ suv) +
ggtitle("Histogram of HighwayMPG\n(By Cylinders vs. SUV)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 11 rows containing non-finite values (stat_bin).
Chapter 3 - Numerical summaries
Measures of center - “what is the typical value”?:
Measures of variability - what are the typical distances from “typical”?:
Shape and transformations - modality and skew:
Outliers - observations with extreme values:
Example code includes:
# Create the data assumed for the exercises
data(gapminder, package="gapminder")
gapminder <- tibble::as_tibble(gapminder)
str(gapminder)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1704 obs. of 6 variables:
## $ country : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ year : int 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
## $ lifeExp : num 28.8 30.3 32 34 36.1 ...
## $ pop : int 8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
## $ gdpPercap: num 779 821 853 836 740 ...
# Create dataset of 2007 data
gap2007 <- filter(gapminder, year == 2007)
# Compute groupwise mean and median lifeExp
gap2007 %>%
group_by(continent) %>%
summarize(mean(lifeExp),
median(lifeExp)
)
## # A tibble: 5 × 3
## continent `mean(lifeExp)` `median(lifeExp)`
## <fctr> <dbl> <dbl>
## 1 Africa 54.80604 52.9265
## 2 Americas 73.60812 72.8990
## 3 Asia 70.72848 72.3960
## 4 Europe 77.64860 78.6085
## 5 Oceania 80.71950 80.7195
# Generate box plots of lifeExp for each continent
gap2007 %>%
ggplot(aes(x = continent, y = lifeExp)) +
geom_boxplot()
# Compute groupwise measures of spread
gap2007 %>%
group_by(continent) %>%
summarize(sd(lifeExp),
IQR(lifeExp),
n()
)
## # A tibble: 5 × 4
## continent `sd(lifeExp)` `IQR(lifeExp)` `n()`
## <fctr> <dbl> <dbl> <int>
## 1 Africa 9.6307807 11.61025 52
## 2 Americas 4.4409476 4.63200 25
## 3 Asia 7.9637245 10.15200 33
## 4 Europe 2.9798127 4.78250 30
## 5 Oceania 0.7290271 0.51550 2
# Generate overlaid density plots
gap2007 %>%
ggplot(aes(x = lifeExp, fill = continent)) +
geom_density(alpha = 0.3)
# Compute stats for lifeExp in Americas
gap2007 %>%
filter(continent == "Americas") %>%
summarize(mean(lifeExp),
sd(lifeExp)
)
## # A tibble: 1 × 2
## `mean(lifeExp)` `sd(lifeExp)`
## <dbl> <dbl>
## 1 73.60812 4.440948
# Compute stats for population
gap2007 %>%
summarize(median(pop),
IQR(pop)
)
## # A tibble: 1 × 2
## `median(pop)` `IQR(pop)`
## <dbl> <dbl>
## 1 10517531 26702008
# Create density plot of old variable
gap2007 %>%
ggplot(aes(x = pop)) +
geom_density()
# Transform the skewed pop variable
gap2007 <- gap2007 %>%
mutate(log_pop = log(pop))
# Create density plot of new variable
gap2007 %>%
ggplot(aes(x = log_pop)) +
geom_density()
# Filter for Asia, add column indicating outliers
gap_asia <- gap2007 %>%
filter(continent == "Asia") %>%
mutate(is_outlier = (lifeExp < 50))
# Remove outliers, create box plot of lifeExp
gap_asia %>%
filter(!is_outlier) %>%
ggplot(aes(x = factor(1), y = lifeExp)) +
geom_boxplot()
Chapter 4 - Case Study
Introducing the data - the email dataset (tibble 3,921 x 21):
Check-in #1:
Check-in #2:
Example code includes:
data(email, package="openintro")
email <- tibble::as_tibble(email)
str(email)
## Classes 'tbl_df', 'tbl' and 'data.frame': 3921 obs. of 21 variables:
## $ spam : num 0 0 0 0 0 0 0 0 0 0 ...
## $ to_multiple : num 0 0 0 0 0 0 1 1 0 0 ...
## $ from : num 1 1 1 1 1 1 1 1 1 1 ...
## $ cc : int 0 0 0 0 0 0 0 1 0 0 ...
## $ sent_email : num 0 0 0 0 0 0 1 1 0 0 ...
## $ time : POSIXct, format: "2012-01-01 00:16:41" "2012-01-01 01:03:59" ...
## $ image : num 0 0 0 0 0 0 0 1 0 0 ...
## $ attach : num 0 0 0 0 0 0 0 1 0 0 ...
## $ dollar : num 0 0 4 0 0 0 0 0 0 0 ...
## $ winner : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ inherit : num 0 0 1 0 0 0 0 0 0 0 ...
## $ viagra : num 0 0 0 0 0 0 0 0 0 0 ...
## $ password : num 0 0 0 0 2 2 0 0 0 0 ...
## $ num_char : num 11.37 10.5 7.77 13.26 1.23 ...
## $ line_breaks : int 202 202 192 255 29 25 193 237 69 68 ...
## $ format : num 1 1 1 1 0 0 1 1 0 1 ...
## $ re_subj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ exclaim_subj: num 0 0 0 0 0 0 0 0 0 0 ...
## $ urgent_subj : num 0 0 0 0 0 0 0 0 0 0 ...
## $ exclaim_mess: num 0 1 6 48 1 1 1 18 1 0 ...
## $ number : Factor w/ 3 levels "none","small",..: 3 2 2 2 1 1 3 2 2 2 ...
# Compute summary statistics
email %>%
group_by(spam) %>%
summarize(median(num_char), IQR(num_char))
## # A tibble: 2 × 3
## spam `median(num_char)` `IQR(num_char)`
## <dbl> <dbl> <dbl>
## 1 0 6.831 13.58225
## 2 1 1.046 2.81800
# Create plot
email %>%
mutate(log_num_char = log(num_char)) %>%
ggplot(aes(x = factor(spam), y = log_num_char)) +
geom_boxplot()
# Create plot for spam and exclaim_mess
email %>% ggplot(aes(x=log(1 + exclaim_mess), fill=factor(spam))) + geom_density(alpha=0.5)
# Create plot of proportion of spam by image
email %>%
mutate(has_image = (image > 0)) %>%
ggplot(aes(x = has_image, fill = factor(spam))) +
geom_bar(position = "fill")
# Do images get counted as attachments?
sum(email$image > email$attach)
## [1] 0
# Question 1
email %>%
filter(dollar > 0) %>%
group_by(spam) %>%
summarize(mean(dollar))
## # A tibble: 2 × 2
## spam `mean(dollar)`
## <dbl> <dbl>
## 1 0 8.211078
## 2 1 3.435897
# Question 2
email %>%
filter(dollar > 10) %>%
ggplot(aes(x = factor(spam))) +
geom_bar()
# Reorder levels
email$number <- factor(email$number, levels=c("none", "small", "big"))
# Construct plot of number
ggplot(email, aes(x=number, fill=factor(spam))) +
geom_bar(position="fill")
Chapter 1 - Introduction to Ideas of Inference
Statistical inference is the process of making claims about a population based on information from a sample of data:
Randomized distributions:
Using the randomization distribution - comparing the observed statistic to the null distribution:
A sample that is consistent with the null hypothesis does not “prove” the null hypothesis; a hypothesis test can only “reject” or “fail to reject” the null hypothesis — it can never accept it
Example code includes:
# PROBLEM - I DO NOT HAVE oilabs::rep_sample_n() ; cut/paste to replicate as oilabs_rep_sample_n
# Copied code from https://github.com/OpenIntroOrg/oilabs/blob/master/R/rep_sample_n.R
#' Draw repeated samples of rows from a table.
#'
#' Local replacement for oilabs::rep_sample_n (cut/paste stand-in, see note
#' above). Draws `reps` independent samples of `size` rows each from `tbl`.
#'
#' @param tbl A data frame (or tibble) to sample rows from.
#' @param size Number of rows in each sample.
#' @param replace Sample rows with replacement? Defaults to FALSE.
#' @param reps Number of repeated samples to draw (must be >= 1).
#' @return A data frame of `reps * size` rows grouped by `replicate`: the
#'   sampled rows of `tbl` with a leading `replicate` column (1..reps).
oilabs_rep_sample_n <- function(tbl, size, replace = FALSE, reps = 1) {
  n <- nrow(tbl)
  # One flat vector of row indices: `reps` independent samples of `size` rows.
  # replicate(..., simplify = FALSE) keeps each draw as a list element so
  # unlist() concatenates them in replicate order.
  i <- unlist(replicate(reps, sample.int(n, size, replace = replace),
                        simplify = FALSE))
  # Tag each sampled row with its replicate number. seq_len(reps) is safe
  # where 1:reps would misbehave for reps == 0; `each = size` repeats every
  # replicate id once per sampled row.
  rep_tbl <- cbind(replicate = rep(seq_len(reps), each = size), tbl[i, ])
  dplyr::group_by(rep_tbl, replicate)
}
And, then the actual coding:
data(NHANES, package="NHANES")
# What are the variables in the NHANES dataset?
names(NHANES)
## [1] "ID" "SurveyYr" "Gender"
## [4] "Age" "AgeDecade" "AgeMonths"
## [7] "Race1" "Race3" "Education"
## [10] "MaritalStatus" "HHIncome" "HHIncomeMid"
## [13] "Poverty" "HomeRooms" "HomeOwn"
## [16] "Work" "Weight" "Length"
## [19] "HeadCirc" "Height" "BMI"
## [22] "BMICatUnder20yrs" "BMI_WHO" "Pulse"
## [25] "BPSysAve" "BPDiaAve" "BPSys1"
## [28] "BPDia1" "BPSys2" "BPDia2"
## [31] "BPSys3" "BPDia3" "Testosterone"
## [34] "DirectChol" "TotChol" "UrineVol1"
## [37] "UrineFlow1" "UrineVol2" "UrineFlow2"
## [40] "Diabetes" "DiabetesAge" "HealthGen"
## [43] "DaysPhysHlthBad" "DaysMentHlthBad" "LittleInterest"
## [46] "Depressed" "nPregnancies" "nBabies"
## [49] "Age1stBaby" "SleepHrsNight" "SleepTrouble"
## [52] "PhysActive" "PhysActiveDays" "TVHrsDay"
## [55] "CompHrsDay" "TVHrsDayChild" "CompHrsDayChild"
## [58] "Alcohol12PlusYr" "AlcoholDay" "AlcoholYear"
## [61] "SmokeNow" "Smoke100" "Smoke100n"
## [64] "SmokeAge" "Marijuana" "AgeFirstMarij"
## [67] "RegularMarij" "AgeRegMarij" "HardDrugs"
## [70] "SexEver" "SexAge" "SexNumPartnLife"
## [73] "SexNumPartYear" "SameSex" "SexOrientation"
## [76] "PregnantNow"
# Create bar plot for Home Ownership by Gender
ggplot(NHANES, aes(x = Gender, fill = HomeOwn)) +
geom_bar(position = "fill") +
ylab("Relative frequencies")
# Density for SleepHrsNight colored by SleepTrouble, faceted by HealthGen
ggplot(NHANES, aes(x = SleepHrsNight, col = SleepTrouble)) +
geom_density(adjust = 2) +
facet_wrap(~ HealthGen)
## Warning: Removed 2245 rows containing non-finite values (stat_density).
# Subset the data: homes
homes <- NHANES %>%
select(Gender, HomeOwn) %>%
filter(HomeOwn %in% c("Own", "Rent"))
# Perform one permutation
homes %>%
mutate(HomeOwn_perm = sample(HomeOwn)) %>%
group_by(Gender) %>%
summarize(prop_own_perm = mean(HomeOwn_perm == "Own"),
prop_own = mean(HomeOwn == "Own")) %>%
summarize(diff_perm = diff(prop_own_perm),
diff_orig = diff(prop_own))
## # A tibble: 1 × 2
## diff_perm diff_orig
## <dbl> <dbl>
## 1 -0.01030001 -0.007828723
# Perform 10 permutations
homeown_perm <- homes %>%
oilabs_rep_sample_n(size = nrow(homes), reps = 10) %>%
mutate(HomeOwn_perm = sample(HomeOwn)) %>%
group_by(replicate, Gender) %>%
summarize(prop_own_perm = mean(HomeOwn_perm == "Own"),
prop_own = mean(HomeOwn == "Own")) %>%
summarize(diff_perm = diff(prop_own_perm),
diff_orig = diff(prop_own)) # male - female
# Print differences to console
homeown_perm
## # A tibble: 10 × 3
## replicate diff_perm diff_orig
## <int> <dbl> <dbl>
## 1 1 0.005351495 -0.007828723
## 2 2 -0.016066359 -0.007828723
## 3 3 -0.009064368 -0.007828723
## 4 4 0.004115849 -0.007828723
## 5 5 -0.006593078 -0.007828723
## 6 6 0.001232677 -0.007828723
## 7 7 0.002056440 -0.007828723
## 8 8 0.009058431 -0.007828723
## 9 9 0.009470313 -0.007828723
## 10 10 -0.006593078 -0.007828723
# Dotplot of 10 permuted differences in proportions
ggplot(homeown_perm, aes(x = diff_perm)) +
geom_dotplot(binwidth = 0.001)
# Perform 100 permutations
homeown_perm <- homes %>%
oilabs_rep_sample_n(nrow(homes), reps=100) %>%
mutate(HomeOwn_perm = sample(HomeOwn)) %>%
group_by(replicate, Gender) %>%
summarize(prop_own_perm = mean(HomeOwn_perm == "Own"),
prop_own = mean(HomeOwn == "Own")) %>%
summarize(diff_perm = diff(prop_own_perm),
diff_orig = diff(prop_own)) # male - female
# Dotplot of 100 permuted differences in proportions
ggplot(homeown_perm, aes(x = diff_perm)) +
geom_dotplot(binwidth = 0.001)
# Perform 1000 permutations
homeown_perm <- homes %>%
oilabs_rep_sample_n(nrow(homes), reps=1000) %>%
mutate(HomeOwn_perm = sample(HomeOwn)) %>%
group_by(replicate, Gender) %>%
summarize(prop_own_perm = mean(HomeOwn_perm == "Own"),
prop_own = mean(HomeOwn == "Own")) %>%
summarize(diff_perm = diff(prop_own_perm),
diff_orig = diff(prop_own)) # male - female
# Density plot of 1000 permuted differences in proportions
ggplot(homeown_perm, aes(x = diff_perm)) +
geom_density()
# Plot permuted differences
ggplot(homeown_perm, aes(x = diff_perm)) +
geom_density() +
geom_vline(aes(xintercept = diff_orig),
col = "red")
# Compare permuted differences to observed difference
homeown_perm %>%
summarize(sum(diff_orig >= diff_perm))
## # A tibble: 1 × 1
## `sum(diff_orig >= diff_perm)`
## <int>
## 1 218
Chapter 2 - Completing a randomization study
Gender discrimination case - promotion case study among bank managers:
Distribution of statistics - different forms of the null hypothesis:
Why 0.05 for the critical region?
What is a p-value?
Example code includes:
# Assemble the gender-discrimination case-study data: 48 bank managers,
# recording each manager's sex and whether he or she was promoted.
# The raw observations arrive as " ; "-delimited strings.
discPromote <- "promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted"
discSex <- "male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; female ; female ; female ; female ; female ; female ; female ; female ; female ; female ; female ; female ; female ; female ; male ; male ; male ; female ; female ; female ; female ; female ; female ; female ; female ; female ; female"
# Explicit `levels` fixes the factor order so "promoted" is the second level.
disc <- data.frame(
  promote = factor(strsplit(discPromote, " ; ")[[1]],
                   levels = c("not_promoted", "promoted")),
  sex = factor(strsplit(discSex, " ; ")[[1]])
)
# Create a contingency table summarizing the data
table(disc$sex, disc$promote)
##
## not_promoted promoted
## female 10 14
## male 3 21
# Find proportion of each sex who were promoted
disc %>%
group_by(sex) %>%
summarize(promoted_prop=mean(promote == "promoted"))
## # A tibble: 2 × 2
## sex promoted_prop
## <fctr> <dbl>
## 1 female 0.5833333
## 2 male 0.8750000
# Sample the entire data frame 5 times
disc %>%
oilabs_rep_sample_n(size = nrow(disc), reps = 5)
## Source: local data frame [240 x 3]
## Groups: replicate [5]
##
## replicate promote sex
## * <int> <fctr> <fctr>
## 1 1 not_promoted male
## 2 1 promoted male
## 3 1 promoted female
## 4 1 not_promoted female
## 5 1 not_promoted male
## 6 1 promoted male
## 7 1 not_promoted female
## 8 1 promoted female
## 9 1 promoted male
## 10 1 promoted male
## # ... with 230 more rows
# Shuffle the promote variable within replicate
disc %>%
oilabs_rep_sample_n(size = nrow(disc), reps = 5) %>%
mutate(prom_perm = sample(promote))
## Source: local data frame [240 x 4]
## Groups: replicate [5]
##
## replicate promote sex prom_perm
## <int> <fctr> <fctr> <fctr>
## 1 1 not_promoted female promoted
## 2 1 not_promoted female promoted
## 3 1 promoted female not_promoted
## 4 1 not_promoted female promoted
## 5 1 promoted male not_promoted
## 6 1 promoted male not_promoted
## 7 1 promoted female promoted
## 8 1 promoted male not_promoted
## 9 1 not_promoted female not_promoted
## 10 1 not_promoted male promoted
## # ... with 230 more rows
# Find the proportion of promoted in each replicate and sex
disc %>%
oilabs_rep_sample_n(size = nrow(disc), reps = 5) %>%
mutate(prom_perm = sample(promote)) %>%
group_by(replicate, sex) %>%
summarize(prop_prom_perm = mean(prom_perm == "promoted"),
prop_prom = mean(promote == "promoted"))
## Source: local data frame [10 x 4]
## Groups: replicate [?]
##
## replicate sex prop_prom_perm prop_prom
## <int> <fctr> <dbl> <dbl>
## 1 1 female 0.6666667 0.5833333
## 2 1 male 0.7916667 0.8750000
## 3 2 female 0.7500000 0.5833333
## 4 2 male 0.7083333 0.8750000
## 5 3 female 0.7083333 0.5833333
## 6 3 male 0.7500000 0.8750000
## 7 4 female 0.7083333 0.5833333
## 8 4 male 0.7500000 0.8750000
## 9 5 female 0.8333333 0.5833333
## 10 5 male 0.6250000 0.8750000
# Difference in the proportion promoted between the sexes, within each replicate
disc %>%
oilabs_rep_sample_n(size = nrow(disc), reps = 5) %>%
mutate(prom_perm = sample(promote)) %>%
group_by(replicate, sex) %>%
summarize(prop_prom_perm = mean(prom_perm == "promoted"),
prop_prom = mean(promote == "promoted")) %>%
summarize(diff_perm = diff(prop_prom_perm),
diff_orig = diff(prop_prom)) # male - female
## # A tibble: 5 × 3
## replicate diff_perm diff_orig
## <int> <dbl> <dbl>
## 1 1 0.04166667 0.2916667
## 2 2 0.12500000 0.2916667
## 3 3 -0.12500000 0.2916667
## 4 4 0.12500000 0.2916667
## 5 5 0.20833333 0.2916667
# Create a data frame of differences in promotion rates
disc_perm <- disc %>%
oilabs_rep_sample_n(size = nrow(disc), reps = 1000) %>%
mutate(prom_perm = sample(promote)) %>%
group_by(replicate, sex) %>%
summarize(prop_prom_perm = mean(prom_perm == "promoted"),
prop_prom = mean(promote == "promoted")) %>%
summarize(diff_perm = diff(prop_prom_perm),
diff_orig = diff(prop_prom)) # male - female
# Histogram of permuted differences
ggplot(disc_perm, aes(x = diff_perm)) +
geom_histogram(binwidth = 0.01) +
geom_vline(aes(xintercept = diff_orig), col = "red")
# Find the 0.90, 0.95, and 0.99 quantiles of diff_perm
disc_perm %>%
summarize(q.90 = quantile(diff_perm, p = 0.90),
q.95 = quantile(diff_perm, p = 0.95),
q.99 = quantile(diff_perm, p = 0.99)
)
## # A tibble: 1 × 3
## q.90 q.95 q.99
## <dbl> <dbl> <dbl>
## 1 0.125 0.2083333 0.2916667
# Find the 0.10, 0.05, and 0.01 quantiles of diff_perm
disc_perm %>%
summarize(q.01 = quantile(diff_perm, p = 0.01),
q.05 = quantile(diff_perm, p = 0.05),
q.10 = quantile(diff_perm, p = 0.10)
)
## # A tibble: 1 × 3
## q.01 q.05 q.10
## <dbl> <dbl> <dbl>
## 1 -0.2916667 -0.2083333 -0.2083333
discsmallSex <- "2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 1 ; 1 ; 1 ; 1 ; 1 ; 2 ; 1 ; 1 ; 1" # 2 is male
# Reconstruct the sex/promotion factors for the big and small discrimination
# datasets ("1"/"2" codes stored as semicolon-separated strings).
# NOTE(review): discsmallSex is not defined in this chunk -- it is presumably
# created earlier in these notes; confirm before running this chunk standalone.
discbigSex <- "2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1" # 2 is male
discbigPromote <- "2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1 ; 1" # 2 is promote
discsmallPromote <- "2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 2 ; 1 ; 1 ; 1 ; 1" # 2 is promote
# Give factor() its levels explicitly so the "1"/"2" -> label mapping cannot
# silently depend on the alphabetical sort order of whatever values happen to
# be present in the string ("1" -> first label, "2" -> second label).
dsSex <- factor(strsplit(discsmallSex, " ; ")[[1]],
                levels=c("1", "2"),
                labels=c("female", "male")
                )
dbSex <- factor(strsplit(discbigSex, " ; ")[[1]],
                levels=c("1", "2"),
                labels=c("female", "male")
                )
dsPromote <- factor(strsplit(discsmallPromote, " ; ")[[1]],
                    levels=c("1", "2"),
                    labels=c("not_promoted", "promoted")
                    )
dbPromote <- factor(strsplit(discbigPromote, " ; ")[[1]],
                    levels=c("1", "2"),
                    labels=c("not_promoted", "promoted")
                    )
# Assemble the two data frames used by the permutation tests below.
disc_small <- data.frame(sex=dsSex, promote=dsPromote)
disc_big <- data.frame(sex=dbSex, promote=dbPromote)
# Tabulate the small and big data frames
# (disc_big shows a similar male-female promotion gap but with 30x the sample
# size -- sample size is what drives the permutation p-values further on)
disc_small %>%
select(sex, promote) %>%
table()
## promote
## sex not_promoted promoted
## female 3 5
## male 1 7
disc_big %>%
select(sex, promote) %>%
table()
## promote
## sex not_promoted promoted
## female 100 140
## male 30 210
# Create a 1000 permutation for each
# oilabs_rep_sample_n() is a helper from the openintro/oilabs course materials
# (repeated sampling that adds a `replicate` id column). sample(promote)
# shuffles the promotion labels, breaking any sex/promotion association;
# results vary run to run unless a seed is set beforehand.
disc_small_perm <- disc_small %>%
oilabs_rep_sample_n(size = nrow(disc_small), reps = 1000) %>%
mutate(prom_perm = sample(promote)) %>%
group_by(replicate, sex) %>%
summarize(prop_prom_perm = mean(prom_perm == "promoted"),
prop_prom = mean(promote == "promoted")) %>%
summarize(diff_perm = diff(prop_prom_perm),
diff_orig = diff(prop_prom)) # male - female
# Create a 1000 permutation for each
disc_big_perm <- disc_big %>%
oilabs_rep_sample_n(size = nrow(disc_big), reps = 1000) %>%
mutate(prom_perm = sample(promote)) %>%
group_by(replicate, sex) %>%
summarize(prop_prom_perm = mean(prom_perm == "promoted"),
prop_prom = mean(promote == "promoted")) %>%
summarize(diff_perm = diff(prop_prom_perm),
diff_orig = diff(prop_prom)) # male - female
# Plot the distributions of permuted differences
# (the red line marks the observed male-female difference in each dataset)
ggplot(disc_small_perm, aes(x = diff_perm)) +
geom_histogram(binwidth = 0.01) +
geom_vline(aes(xintercept = diff_orig), col = "red")
ggplot(disc_big_perm, aes(x = diff_perm)) +
geom_histogram(binwidth = 0.01) +
geom_vline(aes(xintercept = diff_orig), col = "red")
# Recall the quantiles associated with the original dataset
# NOTE(review): disc_perm is not built until a later chunk of these notes --
# this chunk assumes it already exists in the session (notes transcript, not
# a linear script).
disc_perm %>%
summarize(q.90 = quantile(diff_perm, p = 0.90),
q.95 = quantile(diff_perm, p = 0.95),
q.99 = quantile(diff_perm, p = 0.99)
)
## # A tibble: 1 × 3
## q.90 q.95 q.99
## <dbl> <dbl> <dbl>
## 1 0.125 0.2083333 0.2916667
# Calculate the quantiles associated with the small dataset
disc_small_perm %>%
summarize(q.90 = quantile(diff_perm, p = 0.90),
q.95 = quantile(diff_perm, p = 0.95),
q.99 = quantile(diff_perm, p = 0.99)
)
## # A tibble: 1 × 3
## q.90 q.95 q.99
## <dbl> <dbl> <dbl>
## 1 0.25 0.25 0.5
# Calculate the quantiles associated with the big dataset
disc_big_perm %>%
summarize(q.90 = quantile(diff_perm, p = 0.90),
q.95 = quantile(diff_perm, p = 0.95),
q.99 = quantile(diff_perm, p = 0.99)
)
## # A tibble: 1 × 3
## q.90 q.95 q.99
## <dbl> <dbl> <dbl>
## 1 0.05 0.06666667 0.09175
# Calculate the p-value for the original dataset
# (one-sided p-value: the share of permuted differences at least as large as
# the observed male-female difference)
disc_perm %>%
summarize(mean(diff_orig <= diff_perm))
## # A tibble: 1 × 1
## `mean(diff_orig <= diff_perm)`
## <dbl>
## 1 0.031
# Calculate the p-value for the small dataset
disc_small_perm %>%
summarize(mean(diff_orig <= diff_perm))
## # A tibble: 1 × 1
## `mean(diff_orig <= diff_perm)`
## <dbl>
## 1 0.29
# Calculate the p-value for the big dataset
disc_big_perm %>%
summarize(mean(diff_orig <= diff_perm))
## # A tibble: 1 × 1
## `mean(diff_orig <= diff_perm)`
## <dbl>
## 1 0
# Reconstruct disc_new: a promotion-discrimination dataset with a much weaker
# sex/promotion association than the original disc data (used to compare
# permutation p-values). 48 rows: 35 promoted followed by 13 not promoted.
dnPromote <- "promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted ; not_promoted"
dnSex <- "male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; male ; female ; female ; female ; female ; female ; female ; female ; female ; female ; female ; female ; female ; female ; female ; female ; female ; female ; male ; male ; male ; male ; male ; male ; female ; female ; female ; female ; female ; female ; female"
# Both factors get explicit levels (the original specified them only for
# promote); this keeps the coding independent of which values happen to be
# observed and of their alphabetical order.
disc_new <- data.frame(promote=factor(strsplit(dnPromote, " ; ")[[1]],
                                      levels=c("not_promoted", "promoted")
                                      ),
                       sex=factor(strsplit(dnSex, " ; ")[[1]],
                                  levels=c("female", "male")
                                  )
                       )
# Create a 1000 permutation for each
# NOTE(review): `disc` is not constructed anywhere in these notes -- it is
# presumably loaded earlier (openintro course data); confirm before running.
disc_perm <- disc %>%
oilabs_rep_sample_n(size = nrow(disc), reps = 1000) %>%
mutate(prom_perm = sample(promote)) %>%
group_by(replicate, sex) %>%
summarize(prop_prom_perm = mean(prom_perm == "promoted"),
prop_prom = mean(promote == "promoted")) %>%
summarize(diff_perm = diff(prop_prom_perm),
diff_orig = diff(prop_prom)) # male - female
disc_new_perm <- disc_new %>%
oilabs_rep_sample_n(size = nrow(disc_new), reps = 1000) %>%
mutate(prom_perm = sample(promote)) %>%
group_by(replicate, sex) %>%
summarize(prop_prom_perm = mean(prom_perm == "promoted"),
prop_prom = mean(promote == "promoted")) %>%
summarize(diff_perm = diff(prop_prom_perm),
diff_orig = diff(prop_prom)) # male - female
# Recall the original data
disc %>%
select(sex, promote) %>%
table()
## promote
## sex not_promoted promoted
## female 10 14
## male 3 21
# Tabulate the new data
# (disc_new has nearly equal promotion rates across sex, so the permutation
# test should NOT reject the null here)
disc_new %>%
select(sex, promote) %>%
table()
## promote
## sex not_promoted promoted
## female 7 17
## male 6 18
# Plot the distribution of the original permuted differences
disc_perm %>%
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Plot the distribution of the new permuted differences
ggplot(disc_new_perm, aes(x = diff_perm)) +
geom_histogram() +
geom_vline(aes(xintercept = diff_orig), col = "red")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Find the p-value from the original data
disc_perm %>%
summarize(mean(diff_orig <= diff_perm))
## # A tibble: 1 × 1
## `mean(diff_orig <= diff_perm)`
## <dbl>
## 1 0.026
# Find the p-value from the new data
disc_new_perm %>%
summarize(mean(diff_orig <= diff_perm))
## # A tibble: 1 × 1
## `mean(diff_orig <= diff_perm)`
## <dbl>
## 1 0.466
Chapter 3 - Hypothesis Testing Errors
Opportunity cost - do reminders about saving money encourage students to purchase fewer DVDs? (Frederick et al. study):
Errors and their consequences - consequences of various conclusions and associated errors:
Example code includes:
# Reconstruct the opportunity-cost study data: 150 students, each deciding
# whether to buy a DVD, split into a control group and a treatment group that
# was reminded the money could be saved.
oppDec <- "buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; buyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD ; nobuyDVD"
oppGroup <- "control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; control ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment ; treatment"
# Explicit factor levels guard against a silent reordering if either category
# were absent or the data were edited (here they match the alphabetical
# default, so behavior is unchanged).
opportunity <- data.frame(decision=factor(strsplit(oppDec, " ; ")[[1]],
                                          levels=c("buyDVD", "nobuyDVD")
                                          ),
                          group=factor(strsplit(oppGroup, " ; ")[[1]],
                                       levels=c("control", "treatment")
                                       )
                          )
# Tabulate the data
opportunity %>%
select(decision, group) %>%
table()
## group
## decision control treatment
## buyDVD 56 41
## nobuyDVD 19 34
# Find the proportion who bought the DVD in each group
# (treatment group buys less often: 54.7% vs. 74.7% in control)
opportunity %>%
group_by(group) %>%
summarize(buy_prop = mean(decision == "buyDVD"))
## # A tibble: 2 × 2
## group buy_prop
## <fctr> <dbl>
## 1 control 0.7466667
## 2 treatment 0.5466667
# Create a barplot
ggplot(opportunity, aes(x = group, fill = decision)) +
geom_bar(position="fill")
# Data frame of differences in purchase rates after permuting
# (null model: shuffle decisions, so group has no effect on buying)
opp_perm <- opportunity %>%
oilabs_rep_sample_n(size = nrow(opportunity), reps = 1000) %>%
mutate(dec_perm = sample(decision)) %>%
group_by(replicate, group) %>%
summarize(prop_buy_perm = mean(dec_perm == "buyDVD"),
prop_buy = mean(decision == "buyDVD")) %>%
summarize(diff_perm = diff(prop_buy_perm),
diff_orig = diff(prop_buy)) # treatment - control
# Histogram of permuted differences
ggplot(opp_perm, aes(x = diff_perm)) +
geom_histogram(binwidth = .005) +
geom_vline(aes(xintercept = diff_orig), col = "red")
# Calculate the p-value
opp_perm %>%
summarize(mean(diff_perm <= diff_orig))
## # A tibble: 1 × 1
## `mean(diff_perm <= diff_orig)`
## <dbl>
## 1 0.006
# Calculate the two-sided p-value
# (doubles the one-sided tail probability)
opp_perm %>%
summarize(2*mean(diff_perm <= diff_orig))
## # A tibble: 1 × 1
## `2 * mean(diff_perm <= diff_orig)`
## <dbl>
## 1 0.012
Chapter 4 - Confidence Intervals
Parameters and confidence intervals - research questions can be comparative (hypothesis test) or estimation (confidence intervals):
Bootstrapping:
Variability in p-hat - how far are the sample data from the parameter?
Interpreting CI and technical conditions:
Example code includes:
# Do not have this dataset (30000 x 2 - poll-vote) - 30 votes in each of 1000 samples
# voteSum[i] is the number of "yes" (1) votes in a 30-vote poll; voteN[i] is
# how many of the 1000 polls had that total (voteN sums to 1000).
voteSum <- c(9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25)
voteN <- c(1, 7, 10, 27, 42, 90, 101, 143, 151, 136, 129, 79, 43, 25, 13, 3)
# Build all 30000 votes in one pass. The original grew voteAll with c() inside
# the loop, which re-copies the vector every iteration (O(n^2)); lapply +
# unlist produces the same vector, in the same order, without growth.
voteAll <- unlist(lapply(seq_along(voteSum), function(i) {
  ballot <- rep(0L, 30)
  ballot[seq_len(voteSum[i])] <- 1L
  rep(ballot, times = voteN[i])
}), use.names = FALSE)
# Random permutation of poll ids 1..1000 (not reproducible without set.seed)
voteNum <- sample(1:1000, 1000, replace=FALSE)
# Needs to be a tibble since oilabs_rep_sample_n() has an implied drop=TRUE for data frames
all_polls <- tibble::as_tibble(data.frame(poll=rep(voteNum, each=30),
vote=voteAll
) %>% arrange(poll)
)
# Select one poll from which to resample: one_poll
one_poll <- all_polls %>%
filter(poll == 1) %>%
select(vote)
# Generate 1000 resamples of one_poll: one_poll_boot_30
# (bootstrap: resample WITH replacement, same size as the original sample)
one_poll_boot_30 <- one_poll %>%
oilabs_rep_sample_n(size = nrow(one_poll), replace = TRUE, reps = 1000)
# Compute p-hat for each poll: ex1_props
ex1_props <- all_polls %>%
group_by(poll) %>%
summarize(prop_yes = mean(vote))
# Compute p-hat* for each resampled poll: ex2_props
ex2_props <- one_poll_boot_30 %>%
group_by(replicate) %>%
summarize(prop_yes = mean(vote))
# Compare variability of p-hat and p-hat*
# (bootstrap SD from one sample approximates the sampling SD across polls)
ex1_props %>% summarize(sd(prop_yes))
## # A tibble: 1 × 1
## `sd(prop_yes)`
## <dbl>
## 1 0.08683128
ex2_props %>% summarize(sd(prop_yes))
## # A tibble: 1 × 1
## `sd(prop_yes)`
## <dbl>
## 1 0.0840922
# Resample from one_poll with n = 3: one_poll_boot_3
one_poll_boot_3 <- one_poll %>%
oilabs_rep_sample_n(3, replace = TRUE, reps = 1000)
# Resample from one_poll with n = 300: one_poll_boot_300
one_poll_boot_300 <- one_poll %>%
oilabs_rep_sample_n(300, replace = TRUE, reps = 1000)
# Compute p-hat* for each resampled poll: ex3_props
ex3_props <- one_poll_boot_3 %>%
summarize(prop_yes = mean(vote))
# Compute p-hat* for each resampled poll: ex4_props
ex4_props <- one_poll_boot_300 %>%
summarize(prop_yes = mean(vote))
# Compare variability of p-hat* for n = 3 vs. n = 300
# (resampling at the wrong size misstates the SE: n = 3 inflates it,
# n = 300 understates it -- resamples must match the original n = 30)
ex3_props %>% summarize(sd(prop_yes))
## # A tibble: 1 × 1
## `sd(prop_yes)`
## <dbl>
## 1 0.2610443
ex4_props %>% summarize(sd(prop_yes))
## # A tibble: 1 × 1
## `sd(prop_yes)`
## <dbl>
## 1 0.02643497
# Recall the variability of sample proportions
ex1_props %>% summarize(sd(prop_yes))
## # A tibble: 1 × 1
## `sd(prop_yes)`
## <dbl>
## 1 0.08683128
ex2_props %>% summarize(sd(prop_yes))
## # A tibble: 1 × 1
## `sd(prop_yes)`
## <dbl>
## 1 0.0840922
ex3_props %>% summarize(sd(prop_yes))
## # A tibble: 1 × 1
## `sd(prop_yes)`
## <dbl>
## 1 0.2610443
ex4_props %>% summarize(sd(prop_yes))
## # A tibble: 1 × 1
## `sd(prop_yes)`
## <dbl>
## 1 0.02643497
# Create smoothed density curves for all four experiments
ggplot() +
geom_density(data = ex1_props, aes(x = prop_yes), col = "black", bw = .1) +
geom_density(data = ex2_props, aes(x = prop_yes), col = "green", bw = .1) +
geom_density(data = ex3_props, aes(x = prop_yes), col = "red", bw = .1) +
geom_density(data = ex4_props, aes(x = prop_yes), col = "blue", bw = .1)
# Compute proportion of votes for Candidate X: props
props <- all_polls %>%
group_by(poll) %>%
summarize(prop_yes = mean(vote))
# Proportion of polls within 2SE
# (empirical check of the ~95% coverage of a 2-SE interval)
props %>%
mutate(lower = mean(prop_yes) - 2 * sd(prop_yes),
upper = mean(prop_yes) + 2 * sd(prop_yes),
in_CI = prop_yes > lower & prop_yes < upper) %>%
summarize(mean(in_CI))
## # A tibble: 1 × 1
## `mean(in_CI)`
## <dbl>
## 1 0.966
# Again, set the one sample that was collected
one_poll <- all_polls %>%
filter(poll == 1) %>%
select(vote)
# Compute p-hat from one_poll: p_hat
p_hat <- mean(one_poll$vote)
# Bootstrap to find the SE of p-hat: one_poll_boot
one_poll_boot <- one_poll %>%
oilabs_rep_sample_n(30, replace = TRUE, reps = 1000) %>%
summarize(prop_yes_boot = mean(vote))
# Create an interval of plausible values
# (bootstrap t-interval: p_hat +/- 2 bootstrap standard errors)
one_poll_boot %>%
summarize(lower = p_hat - 2 * sd(prop_yes_boot),
upper = p_hat + 2 * sd(prop_yes_boot)
)
## # A tibble: 1 × 2
## lower upper
## <dbl> <dbl>
## 1 0.5347743 0.8652257
# Find the 2.5% and 97.5% of the p-hat values
# (percentile interval: quantiles of the bootstrap distribution itself)
one_poll_boot %>%
summarize(q025_prop = quantile(prop_yes_boot, p = 0.025),
q975_prop = quantile(prop_yes_boot, p = 0.975))
## # A tibble: 1 × 2
## q025_prop q975_prop
## <dbl> <dbl>
## 1 0.5333333 0.8333333
# Bootstrap t-confidence interval for comparison
one_poll_boot %>%
summarize(lower = p_hat - 2 * sd(prop_yes_boot),
upper = p_hat + 2 * sd(prop_yes_boot)
)
## # A tibble: 1 × 2
## lower upper
## <dbl> <dbl>
## 1 0.5347743 0.8652257
# Recall the bootstrap t-confidence interval
p_hat <- mean(one_poll$vote)
one_poll_boot %>%
summarize(lower = p_hat - 2 * sd(prop_yes_boot),
upper = p_hat + 2 * sd(prop_yes_boot))
## # A tibble: 1 × 2
## lower upper
## <dbl> <dbl>
## 1 0.5347743 0.8652257
# Collect a sample of 30 observations from the population
# (tibble::as_tibble replaces the deprecated dplyr::as.tbl used originally;
# matches the usage elsewhere in these notes. rbinom is unseeded, so the
# printed results below are not reproducible.)
one_poll <- tibble::as_tibble(data.frame(vote = rbinom(n = 30, 1, .6)))
# Resample the data using samples of size 300 (an incorrect strategy!)
one_poll_boot_300 <- one_poll %>%
oilabs_rep_sample_n(size=300, replace = TRUE, reps = 1000) %>%
summarize(prop_yes_boot = mean(vote))
# Find the endpoints of the bootstrap t-confidence interval
# NOTE(review): p_hat here is the stale value from the PREVIOUS sample, not
# recomputed for this new one_poll -- confirm this matches the course intent.
one_poll_boot_300 %>%
summarize(lower = p_hat - 2 * sd(prop_yes_boot),
upper = p_hat + 2 * sd(prop_yes_boot)
)
## # A tibble: 1 × 2
## lower upper
## <dbl> <dbl>
## 1 0.6479107 0.7520893
# Resample the data using samples of size 3 (an incorrect strategy!)
one_poll_boot_3 <- one_poll %>%
oilabs_rep_sample_n(size=3, replace = TRUE, reps = 1000) %>%
summarize(prop_yes_boot = mean(vote))
# Find the endpoints of the bootstrap t-confidence interval
# (n = 3 resamples grossly inflate the SE -- interval even exceeds [0, 1])
one_poll_boot_3 %>%
summarize(lower = p_hat - 2 * sd(prop_yes_boot),
upper = p_hat + 2 * sd(prop_yes_boot)
)
## # A tibble: 1 × 2
## lower upper
## <dbl> <dbl>
## 1 0.1871818 1.212818
# Collect 30 observations from a population with true proportion of 0.8
# (tibble::as_tibble replaces the deprecated dplyr::as.tbl; rbinom is
# unseeded, so the printed results below are not reproducible.)
one_poll <- tibble::as_tibble(data.frame(vote = rbinom(n = 30, size = 1, prob = 0.8)))
# Compute p-hat of new sample: p_hat
p_hat <- mean(one_poll$vote)
# Resample the 30 observations (with replacement)
one_poll_boot <- one_poll %>%
oilabs_rep_sample_n(size=nrow(one_poll), replace = TRUE, reps = 1000) %>%
summarize(prop_yes_boot = mean(vote))
# Calculate the bootstrap t-confidence interval
one_poll_boot %>%
summarize(lower = p_hat - 2 * sd(prop_yes_boot),
upper = p_hat + 2 * sd(prop_yes_boot)
)
## # A tibble: 1 × 2
## lower upper
## <dbl> <dbl>
## 1 0.6104576 0.9228758
# Calculate a 95% bootstrap percentile interval
one_poll_boot %>%
summarize(q025_prop = quantile(prop_yes_boot, p = 0.025),
q975_prop = quantile(prop_yes_boot, p = 0.975))
## # A tibble: 1 × 2
## q025_prop q975_prop
## <dbl> <dbl>
## 1 0.6 0.9
# Calculate a 99% bootstrap percentile interval
one_poll_boot %>%
summarize(q005_prop = quantile(prop_yes_boot, p = 0.005),
q995_prop = quantile(prop_yes_boot, p = 0.995))
## # A tibble: 1 × 2
## q005_prop q995_prop
## <dbl> <dbl>
## 1 0.5666667 0.9333333
# Calculate a 90% bootstrap percentile interval
# (higher confidence -> wider interval; 90% is the narrowest of the three)
one_poll_boot %>%
summarize(q05_prop = quantile(prop_yes_boot, p = 0.05),
q95_prop = quantile(prop_yes_boot, p = 0.95))
## # A tibble: 1 × 2
## q05_prop q95_prop
## <dbl> <dbl>
## 1 0.6333333 0.9
Chapter 1 - Correlation and Regression
Modeling bivariate relationships - relationships between two variables:
Characterizing bivariate relationships:
Outliers - points that do not fit with the rest of the data:
Example code includes:
# Exploratory scatterplots for several openintro datasets
data(ncbirths, package="openintro")
# Scatterplot of weight vs. weeks
ggplot(ncbirths, aes(x=weeks, y=weight)) +
geom_point()
## Warning: Removed 2 rows containing missing values (geom_point).
# Boxplot of weight vs. weeks
# (cut() discretizes weeks into 5 equal-width bins for side-by-side boxplots)
ggplot(data = ncbirths,
aes(x = cut(weeks, breaks = 5), y = weight)) +
geom_boxplot()
# Mammals scatterplot
data(mammals, package="openintro")
ggplot(mammals, aes(x=BodyWt, y=BrainWt)) +
geom_point()
# Baseball player scatterplot
data(mlbBat10, package="openintro")
ggplot(mlbBat10, aes(x=OBP, y=SLG)) +
geom_point()
# Body dimensions scatterplot
data(bdims, package="openintro")
ggplot(bdims, aes(x=hgt, y=wgt, color=factor(sex))) +
geom_point()
# Smoking scatterplot
data(smoking, package="openintro")
ggplot(smoking, aes(x=age, y=amtWeekdays)) +
geom_point()
## Warning: Removed 1270 rows containing missing values (geom_point).
# Scatterplot with coord_trans()
# (transforms the coordinate display only; point geometry computed on raw scale)
ggplot(data = mammals, aes(x = BodyWt, y = BrainWt)) +
geom_point() +
coord_trans(x = "log10", y = "log10")
# Scatterplot with scale_x_log10() and scale_y_log10()
# (transforms the data before statistics are computed)
ggplot(data = mammals, aes(x = BodyWt, y = BrainWt)) +
geom_point() +
scale_x_log10() + scale_y_log10()
# Scatterplot of SLG vs. OBP
mlbBat10 %>%
filter(AB >= 200) %>%
ggplot(aes(x = OBP, y = SLG)) +
geom_point()
# Identify the outlying player
mlbBat10 %>%
filter(AB >= 200, OBP < 0.2)
## name team position G AB R H 2B 3B HR RBI TB BB SO SB CS OBP
## 1 B Wood LAA 3B 81 226 20 33 2 0 4 14 47 6 71 1 0 0.174
## SLG AVG
## 1 0.208 0.146
Chapter 2 - Correlation
Quantifying strength of bivariate relationship - correlation:
Anscombe dataset - four synthetic datasets illustrating the problems with relying on correlation (and regression) summaries alone:
Interpretation of correlation - correlation is not causation:
Spurious correlation:
Example code includes:
data(ncbirths, package="openintro")
# Compute correlation
ncbirths %>%
summarize(N = n(), r = cor(weight, mage))
## N r
## 1 1000 0.05506589
# Compute correlation for all non-missing pairs
# (use = "pairwise.complete.obs" drops NA pairs; without it cor() returns NA
# when either variable has missing values)
ncbirths %>%
summarize(N = n(), r = cor(weight, weeks, use = "pairwise.complete.obs"))
## N r
## 1 1000 0.6701013
# Stack Anscombe's quartet into long format: one row per observation, with
# columns x, y, id (observation number within a set), and set (1-4).
data(anscombe)
Anscombe <- data.frame(
  x   = unlist(anscombe[, 1:4], use.names = FALSE),
  y   = unlist(anscombe[, 5:8], use.names = FALSE),
  id  = rep(seq_len(11), times = 4),
  set = rep(seq_len(4), each = 11)
)
# Anscombe's quartet: same summary statistics, very different scatterplots
ggplot(data = Anscombe, aes(x = x, y = y)) +
geom_point() +
facet_wrap(~ set)
# Compute properties of Anscombe
# (all four sets share means, SDs, and correlation to ~3 decimals)
Anscombe %>%
group_by(set) %>%
summarize(N = n(), mean(x), sd(x), mean(y), sd(y), cor(x, y))
## # A tibble: 4 × 7
## set N `mean(x)` `sd(x)` `mean(y)` `sd(y)` `cor(x, y)`
## <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 11 9 3.316625 7.500909 2.031568 0.8164205
## 2 2 11 9 3.316625 7.500909 2.031657 0.8162365
## 3 3 11 9 3.316625 7.500000 2.030424 0.8162867
## 4 4 11 9 3.316625 7.500909 2.030579 0.8165214
data(mlbBat10, package="openintro")
data(mammals, package="openintro")
data(bdims, package="openintro")
# Correlation for all baseball players
mlbBat10 %>%
summarize(N = n(), r = cor(OBP, SLG))
## N r
## 1 1199 0.8145628
# Correlation for all players with at least 200 ABs
# (dropping low-AB players weakens r: the 0-AB cluster inflated it)
mlbBat10 %>%
filter(AB >= 200) %>%
summarize(N = n(), r = cor(OBP, SLG))
## N r
## 1 329 0.6855364
# Correlation of body dimensions
bdims %>%
group_by(sex) %>%
summarize(N = n(), r = cor(hgt, wgt))
## # A tibble: 2 × 3
## sex N r
## <int> <int> <dbl>
## 1 0 260 0.4310593
## 2 1 247 0.5347418
# Correlation among mammals, with and without log
mammals %>%
summarize(N = n(),
r = cor(BodyWt, BrainWt),
r_log = cor(log(BodyWt), log(BrainWt)))
## N r r_log
## 1 62 0.9341638 0.9595748
# Create a random noise dataset
# (rnorm is unseeded, so specific correlations below vary run to run)
noise <- data.frame(x=rnorm(1000), y=rnorm(1000), z=rep(1:20, each=50))
# Create faceted scatterplot
noise %>%
ggplot(aes(x=x, y=y)) +
geom_point() +
facet_wrap(~ z)
# Compute correlations for each dataset
noise_summary <- noise %>%
group_by(z) %>%
summarize(N = n(), spurious_cor = cor(x, y))
# Isolate sets with correlations above 0.2 in absolute strength
noise_summary %>%
filter(abs(spurious_cor) > 0.2)
## # A tibble: 1 × 3
## z N spurious_cor
## <int> <int> <dbl>
## 1 12 50 -0.2639434
Chapter 3 - Simple Linear Regression
Visualization of linear models - adjusting the intercept and the slope to best fit the data:
Understanding the linear model: Response = f(Explanatory) + Noise:
Regression vs. regression to the mean (Galton):
Example code includes:
# Scatterplot with regression line
ggplot(data = bdims, aes(x = hgt, y = wgt)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
bdims_summary <- bdims %>%
summarize(N=n(), r=cor(hgt, wgt),
mean_hgt=mean(hgt), sd_hgt=sd(hgt),
mean_wgt=mean(wgt), sd_wgt=sd(wgt)
)
# Print bdims_summary
bdims_summary
## N r mean_hgt sd_hgt mean_wgt sd_wgt
## 1 507 0.7173011 171.1438 9.407205 69.14753 13.34576
# Add slope and intercept
# (least-squares formulas: slope = r * sd_y / sd_x; line passes through the
# point of means, giving intercept = mean_y - slope * mean_x)
bdims_summary %>%
mutate(slope = r * sd_wgt / sd_hgt,
intercept = mean_wgt - slope*mean_hgt
)
## N r mean_hgt sd_hgt mean_wgt sd_wgt slope intercept
## 1 507 0.7173011 171.1438 9.407205 69.14753 13.34576 1.017617 -105.0113
data(GaltonFamilies, package="HistData")
GaltonUse <- GaltonFamilies %>%
mutate(sex=gender, height=childHeight) %>%
select(family, father, mother, sex, height)
# Attach the number of children per family as a new nkids column
GaltonUse <- GaltonUse %>%
left_join(GaltonUse %>% group_by(family) %>% summarize(nkids=n()), by="family")
Galton_women <- GaltonUse %>%
filter(sex=="female")
Galton_men <- GaltonUse %>%
filter(sex=="male")
# Height of children vs. height of father
# (regression line is flatter than the y = x identity line: regression
# to the mean)
ggplot(data = Galton_men, aes(x = father, y = height)) +
geom_point() +
geom_abline(slope = 1, intercept = 0) +
geom_smooth(method = "lm", se = FALSE)
# Height of children vs. height of mother
ggplot(data = Galton_women, aes(x = mother, y = height)) +
geom_point() +
geom_abline(slope = 1, intercept = 0) +
geom_smooth(method = "lm", se = FALSE)
Chapter 4 - Interpreting Regression Models
Interpretation of regression coefficients - UCLA textbook pricing (dataset ‘textbooks’):
Linear model object interpretation:
Using the linear model - residuals can give information about biggest outliers (often interesting):
Example code includes:
# Linear model for weight as a function of height
lm(wgt ~ hgt, data = bdims)
##
## Call:
## lm(formula = wgt ~ hgt, data = bdims)
##
## Coefficients:
## (Intercept) hgt
## -105.011 1.018
# Linear model for SLG as a function of OBP
lm(SLG ~ OBP, data=mlbBat10)
##
## Call:
## lm(formula = SLG ~ OBP, data = mlbBat10)
##
## Coefficients:
## (Intercept) OBP
## 0.009407 1.110323
# Log-linear model for body weight as a function of brain weight
lm(log(BodyWt) ~ log(BrainWt), data=mammals)
##
## Call:
## lm(formula = log(BodyWt) ~ log(BrainWt), data = mammals)
##
## Coefficients:
## (Intercept) log(BrainWt)
## -2.509 1.225
mod <- lm(wgt ~ hgt, data = bdims)
# Show the coefficients
coef(mod)
## (Intercept) hgt
## -105.011254 1.017617
# Show the full output
summary(mod)
##
## Call:
## lm(formula = wgt ~ hgt, data = bdims)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.743 -6.402 -1.231 5.059 41.103
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -105.01125 7.53941 -13.93 <2e-16 ***
## hgt 1.01762 0.04399 23.14 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.308 on 505 degrees of freedom
## Multiple R-squared: 0.5145, Adjusted R-squared: 0.5136
## F-statistic: 535.2 on 1 and 505 DF, p-value: < 2.2e-16
# Mean of weights equal to mean of fitted values?
# NOTE(review): exact == on doubles happens to print TRUE here; the robust
# check is isTRUE(all.equal(...)) since float equality is not guaranteed
mean(bdims$wgt) == mean(fitted.values(mod))
## [1] TRUE
# Mean of the residuals
# (zero up to floating-point error, a least-squares identity)
mean(residuals(mod))
## [1] -1.266971e-15
# Create bdims_tidy
# (broom::augment adds per-observation diagnostics: .fitted, .resid, .hat, ...)
bdims_tidy <- broom::augment(mod)
# Glimpse the resulting data frame
glimpse(bdims_tidy)
## Observations: 507
## Variables: 9
## $ wgt <dbl> 65.6, 71.8, 80.7, 72.6, 78.8, 74.8, 86.4, 78.4, 62....
## $ hgt <dbl> 174.0, 175.3, 193.5, 186.5, 187.2, 181.5, 184.0, 18...
## $ .fitted <dbl> 72.05406, 73.37697, 91.89759, 84.77427, 85.48661, 7...
## $ .se.fit <dbl> 0.4320546, 0.4520060, 1.0667332, 0.7919264, 0.81834...
## $ .resid <dbl> -6.4540648, -1.5769666, -11.1975919, -12.1742745, -...
## $ .hat <dbl> 0.002154570, 0.002358152, 0.013133942, 0.007238576,...
## $ .sigma <dbl> 9.312824, 9.317005, 9.303732, 9.301360, 9.312471, 9...
## $ .cooksd <dbl> 5.201807e-04, 3.400330e-05, 9.758463e-03, 6.282074e...
## $ .std.resid <dbl> -0.69413418, -0.16961994, -1.21098084, -1.31269063,...
ben <- data.frame(wgt=74.8, hgt=182.8)
# Print ben
ben
## wgt hgt
## 1 74.8 182.8
# Predict the weight of ben
# (newdata column names must match the model's explanatory variables)
predict(mod, newdata=ben)
## 1
## 81.00909
# Add the line to the scatterplot
ggplot(data = bdims, aes(x = hgt, y = wgt)) +
geom_point() +
geom_abline(data = as.data.frame(t(coef(mod))),
aes(intercept = `(Intercept)`, slope = hgt),
color = "dodgerblue")
Chapter 5 - Model Fit
Assessing model fit - how well does the regression line fit the underlying data?
Comparing model fits:
Unusual points - leverage and influence:
Dealing with unusual points - managing the impacts of leverage and influence:
Example code includes:
# Assess fit of the height-weight model
mod <- lm(wgt ~ hgt, data = bdims)
# View summary of model
summary(mod)
##
## Call:
## lm(formula = wgt ~ hgt, data = bdims)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.743 -6.402 -1.231 5.059 41.103
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -105.01125 7.53941 -13.93 <2e-16 ***
## hgt 1.01762 0.04399 23.14 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.308 on 505 degrees of freedom
## Multiple R-squared: 0.5145, Adjusted R-squared: 0.5136
## F-statistic: 535.2 on 1 and 505 DF, p-value: < 2.2e-16
# Compute the mean of the residuals
mean(residuals(mod))
## [1] -1.266971e-15
# Compute RMSE
# (equals the "Residual standard error" in the summary above: sqrt(SSE/df))
sqrt(sum(residuals(mod)^2) / df.residual(mod))
## [1] 9.30804
# View model summary
summary(mod)
##
## Call:
## lm(formula = wgt ~ hgt, data = bdims)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.743 -6.402 -1.231 5.059 41.103
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -105.01125 7.53941 -13.93 <2e-16 ***
## hgt 1.01762 0.04399 23.14 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.308 on 505 degrees of freedom
## Multiple R-squared: 0.5145, Adjusted R-squared: 0.5136
## F-statistic: 535.2 on 1 and 505 DF, p-value: < 2.2e-16
bdims_tidy <- broom::augment(mod)
# Compute R-squared
# (1 - residual variance / response variance; matches Multiple R-squared)
bdims_tidy %>%
summarize(var_y = var(wgt), var_e = var(.resid)) %>%
mutate(R_squared = 1 - var_e/var_y)
## var_y var_e R_squared
## 1 178.1094 86.46839 0.5145208
mod <- lm(SLG ~ OBP, data=filter(mlbBat10, AB >= 10))
# Rank points of high leverage
mod %>%
broom::augment() %>%
arrange(desc(.hat)) %>%
head()
## SLG OBP .fitted .se.fit .resid .hat .sigma
## 1 0.000 0.000 -0.03744579 0.009956861 0.03744579 0.01939493 0.07153050
## 2 0.000 0.000 -0.03744579 0.009956861 0.03744579 0.01939493 0.07153050
## 3 0.000 0.000 -0.03744579 0.009956861 0.03744579 0.01939493 0.07153050
## 4 0.308 0.550 0.69049108 0.009158810 -0.38249108 0.01641049 0.07011360
## 5 0.000 0.037 0.01152451 0.008770891 -0.01152451 0.01504981 0.07154283
## 6 0.038 0.038 0.01284803 0.008739031 0.02515197 0.01494067 0.07153800
## .cooksd .std.resid
## 1 0.0027664282 0.5289049
## 2 0.0027664282 0.5289049
## 3 0.0027664282 0.5289049
## 4 0.2427446800 -5.3943121
## 5 0.0002015398 -0.1624191
## 6 0.0009528017 0.3544561
# Rank influential points
mod %>%
broom::augment() %>%
arrange(desc(.cooksd)) %>%
head()
## SLG OBP .fitted .se.fit .resid .hat .sigma
## 1 0.308 0.550 0.69049108 0.009158810 -0.3824911 0.016410487 0.07011360
## 2 0.833 0.385 0.47211002 0.004190644 0.3608900 0.003435619 0.07028875
## 3 0.800 0.455 0.56475653 0.006186785 0.2352435 0.007488132 0.07101125
## 4 0.379 0.133 0.13858258 0.005792344 0.2404174 0.006563752 0.07098798
## 5 0.786 0.438 0.54225666 0.005678026 0.2437433 0.006307223 0.07097257
## 6 0.231 0.077 0.06446537 0.007506974 0.1665346 0.011024863 0.07127661
## .cooksd .std.resid
## 1 0.24274468 -5.394312
## 2 0.04407145 5.056428
## 3 0.04114818 3.302718
## 4 0.03760256 3.373787
## 5 0.03712042 3.420018
## 6 0.03057912 2.342252
# Create nontrivial_players: keep players with at least 10 at-bats and drop
# the extreme OBP cases (>= 0.5) flagged as influential above
nontrivial_players <- filter(mlbBat10, AB >= 10 & OBP < 0.5)
# Fit model to new data
mod_cleaner <- lm(SLG ~ OBP, data=nontrivial_players)
# View model summary
summary(mod_cleaner)
##
## Call:
## lm(formula = SLG ~ OBP, data = nontrivial_players)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.31383 -0.04165 -0.00261 0.03992 0.35819
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.043326 0.009823 -4.411 1.18e-05 ***
## OBP 1.345816 0.033012 40.768 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.07011 on 734 degrees of freedom
## Multiple R-squared: 0.6937, Adjusted R-squared: 0.6932
## F-statistic: 1662 on 1 and 734 DF, p-value: < 2.2e-16
# Visualize new model
# NOTE(review): the original wrapped the coefficients with
# as.data.frame(t(coef(mod_cleaner))) and mapped them through aes();
# geom_abline() accepts intercept/slope directly and draws the same line.
ggplot(nontrivial_players, aes(x=OBP, y=SLG)) +
geom_point() +
geom_abline(intercept = coef(mod_cleaner)[["(Intercept)"]],
slope = coef(mod_cleaner)[["OBP"]],
color = "dodgerblue")
# Visualize new model (equivalent: ggplot2 refits the same lm internally)
ggplot(nontrivial_players, aes(x=OBP, y=SLG)) +
geom_point() +
geom_smooth(method="lm")
# Rank high leverage points
# Sort by hat value (descending) to surface the points with unusual
# OBP values; .cooksd is a secondary (ascending) tie-breaker.
mod %>%
broom::augment() %>%
arrange(desc(.hat), .cooksd) %>%
head()
## SLG OBP .fitted .se.fit .resid .hat .sigma
## 1 0.000 0.000 -0.03744579 0.009956861 0.03744579 0.01939493 0.07153050
## 2 0.000 0.000 -0.03744579 0.009956861 0.03744579 0.01939493 0.07153050
## 3 0.000 0.000 -0.03744579 0.009956861 0.03744579 0.01939493 0.07153050
## 4 0.308 0.550 0.69049108 0.009158810 -0.38249108 0.01641049 0.07011360
## 5 0.000 0.037 0.01152451 0.008770891 -0.01152451 0.01504981 0.07154283
## 6 0.038 0.038 0.01284803 0.008739031 0.02515197 0.01494067 0.07153800
## .cooksd .std.resid
## 1 0.0027664282 0.5289049
## 2 0.0027664282 0.5289049
## 3 0.0027664282 0.5289049
## 4 0.2427446800 -5.3943121
## 5 0.0002015398 -0.1624191
## 6 0.0009528017 0.3544561
Chapter 1 - What is statistical modeling?
Statistical models are summaries of data (can be encapsulations, machine learning, etc.):
R objects for statistical modeling - functions, formulae, and data frames:
Example code includes:
# Copy over the function and its core expression
# .expression <- (100 - 5 * (1 - relig_motivation) * (school == "private")) * 1.15^acad_motivation
# Simulated test-score generator from the DataCamp modeling exercises.
# A private school subtracts a penalty that shrinks as religious
# motivation rises; academic motivation compounds the result by 15%
# per unit.
test_scores <- function(school = "private", acad_motivation = 0, relig_motivation = 0) {
  private_penalty <- 5 * (1 - relig_motivation) * (school == "private")
  base_score <- 100 - private_penalty
  base_score * 1.15^acad_motivation
}
# Baseline run
test_scores(school = "public", acad_motivation = 0, relig_motivation = 0)
## [1] 100
# Change school input, leaving others at baseline
test_scores(school = "private", acad_motivation = 0, relig_motivation = 0)
## [1] 95
# Change acad_motivation input, leaving others at baseline
test_scores(school = "public", acad_motivation = 1, relig_motivation = 0)
## [1] 115
# Change relig_motivation input, leaving others at baseline
test_scores(school = "public", acad_motivation = 0, relig_motivation = 1)
## [1] 100
# Use results above to estimate output for new inputs
# (linear extrapolation from the one-at-a-time changes: -5 for private,
# +15 per unit of acad_motivation, +0 per unit of relig_motivation)
my_prediction <- 100 - 5 + (2 * 0) + (2 * 15)
my_prediction
## [1] 125
# Check prediction by using test_scores() directly
# The linear guess (125) underestimates the true value because the
# underlying function is multiplicative in acad_motivation (1.15^x),
# and relig_motivation interacts with the private-school penalty.
test_scores(school = "private", acad_motivation = 2, relig_motivation = 2)
## [1] 138.8625
# Use data() to load Trucking_jobs
data(Trucking_jobs, package="statisticalModeling")
# View the number rows in Trucking_jobs
nrow(Trucking_jobs)
## [1] 129
# Use names() to find variable names in mosaicData::Riders
names(mosaicData::Riders)
## [1] "date" "day" "highT" "lowT" "hi" "lo" "precip"
## [8] "clouds" "riders" "ct" "weekday" "wday"
# Look at the head() of diamonds
head(ggplot2::diamonds)
## # A tibble: 6 × 10
## carat cut color clarity depth table price x y z
## <dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
# NOTE(review): mean_ is assigned here but not referenced below; the call
# that follows uses mosaic::mean directly — confirm this alias is needed.
mean_ <- mosaic::mean_
data(AARP, package="statisticalModeling")
# Find the variable names in AARP
names(AARP)
## [1] "Age" "Sex" "Coverage" "Cost"
# Find the mean cost broken down by sex
mosaic::mean(Cost ~ Sex, data = AARP)
## F M
## 47.29778 57.53056
# Create a boxplot using base, lattice, or ggplot2
boxplot(Cost ~ Sex, data=AARP)
# Make a scatterplot using base, lattice, or ggplot2
plot(Cost ~ Age, data=AARP)
Chapter 2 - Designing and Training Models
Modeling is a process rather than a result:
Evaluating models means assessing how well they match the real world (the underlying data):
Example code includes:
data(Runners, package="statisticalModeling")
# Find the variable names in Runners
names(Runners)
## [1] "age" "net" "gun" "sex"
## [5] "year" "previous" "nruns" "start_position"
# Build models: handicap_model_1, handicap_model_2, handicap_model_3
handicap_model_1 <- lm(net ~ age, data = Runners)
handicap_model_2 <- lm(net ~ sex, data = Runners)
handicap_model_3 <- lm(net ~ age + sex, data = Runners)
# For now, here's a way to visualize the models
statisticalModeling::fmodel(handicap_model_1)
statisticalModeling::fmodel(handicap_model_2)
statisticalModeling::fmodel(handicap_model_3)
# Build rpart model: model_2
model_2 <- rpart::rpart(net ~ age + sex, data=Runners, cp=0.002)
# Examine graph of model_2 (don't change)
statisticalModeling::fmodel(model_2, ~ age + sex)
# DO NOT HAVE THIS DATASET!
# Create run_again_model
# run_again_model <- rpart(runs_again ~ age + sex + net, data=Ran_twice, cp=0.005)
# Visualize the model (don't change)
# fmodel(run_again_model, ~ age + net, data = Ran_twice)
data(AARP, package="statisticalModeling")
# Display the variable names in the AARP data frame
names(AARP)
## [1] "Age" "Sex" "Coverage" "Cost"
# Build a model: insurance_cost_model
insurance_cost_model <- lm(Cost ~ Age + Sex + Coverage, data=AARP)
# Construct a data frame: example_vals
example_vals <- data.frame(Age=60, Sex="F", Coverage=200)
# Predict insurance cost using predict()
predict(insurance_cost_model, newdata=example_vals)
## 1
## 363.637
# Calculate model output using evaluate_model()
statisticalModeling::evaluate_model(insurance_cost_model, data=example_vals)
## Age Sex Coverage model_output
## 1 60 F 200 363.637
# Build a model: insurance_cost_model
insurance_cost_model <- lm(Cost ~ Age + Sex + Coverage, data = AARP)
# Create a data frame: new_inputs_1
new_inputs_1 <- data.frame(Age = c(30, 90), Sex = c("F", "M"),
Coverage = c(0, 100)
)
# Use expand.grid(): new_inputs_2
new_inputs_2 <- expand.grid(Age = c(30, 90), Sex = c("F", "M"),
Coverage = c(0, 100)
)
# Use predict() for new_inputs_1 and new_inputs_2
predict(insurance_cost_model, newdata = new_inputs_1)
## 1 2
## -99.98726 292.88435
predict(insurance_cost_model, newdata = new_inputs_2)
## 1 2 3 4 5 6 7
## -99.98726 101.11503 -89.75448 111.34781 81.54928 282.65157 91.78206
## 8
## 292.88435
# Use evaluate_model() for new_inputs_1 and new_inputs_2
statisticalModeling::evaluate_model(insurance_cost_model, data = new_inputs_1)
## Age Sex Coverage model_output
## 1 30 F 0 -99.98726
## 2 90 M 100 292.88435
statisticalModeling::evaluate_model(insurance_cost_model, data = new_inputs_2)
## Age Sex Coverage model_output
## 1 30 F 0 -99.98726
## 2 90 F 0 101.11503
## 3 30 M 0 -89.75448
## 4 90 M 0 111.34781
## 5 30 F 100 81.54928
## 6 90 F 100 282.65157
## 7 30 M 100 91.78206
## 8 90 M 100 292.88435
# Evaluate insurance_cost_model
statisticalModeling::evaluate_model(insurance_cost_model)
## Age Sex Coverage model_output
## 1 40 F 0 -66.4702087
## 2 60 F 0 0.5638866
## 3 80 F 0 67.5979818
## 4 40 M 0 -56.2374309
## 5 60 M 0 10.7966643
## 6 80 M 0 77.8307596
## 7 40 F 50 24.2980606
## 8 60 F 50 91.3321558
## 9 80 F 50 158.3662510
## 10 40 M 50 34.5308383
## 11 60 M 50 101.5649336
## 12 80 M 50 168.5990288
# Use fmodel() to reproduce the graphic
statisticalModeling::fmodel(insurance_cost_model, ~ Coverage + Age + Sex)
# A new formula to highlight difference in sexes
new_formula <- ~ Coverage + Sex + Age
# Make the new plot (don't change)
statisticalModeling::fmodel(insurance_cost_model, new_formula)
Chapter 3 - Assessing Prediction Performance
Choosing explanatory variables - depends on the intended purpose for the statistical model:
Cross validation - divide the data into two non-overlapping datasets, train and test:
Example code includes:
runIDs <- c( 5035 , 10 , 9271 , 256 , 1175 , 17334 , 1571 , 5264 , 15985 , 2237 , 3178 , 7999 , 16462 , 15443 , 13318 , 10409 , 8741 , 5998 , 2860 , 8710 , 3695 , 12340 , 6598 , 6354 , 1125 , 8759 , 7238 , 294 , 2268 , 7219 , 9154 , 5940 , 7464 , 3669 , 14729 , 11636 , 5018 , 1877 , 4639 , 1049 , 4484 , 3896 , 8944 , 11838 , 5960 , 15648 , 11552 , 250 , 9584 , 15110 , 9106 , 10824 , 7706 , 5653 , 4018 , 8028 , 7468 , 14766 , 2945 , 10805 , 2439 , 13616 , 3151 , 10493 , 13595 , 3308 , 1038 , 9019 , 3477 , 11211 , 12410 , 7697 , 7709 , 3699 , 16979 , 9688 , 4891 , 6010 , 6582 , 3983 , 920 , 8972 , 9185 , 4265 , 14708 , 7575 , 3459 , 11727 , 14696 , 4075 , 6604 , 13815 , 260 , 8606 , 14643 , 4323 , 13826 , 3487 , 10602 , 4029 )
runAge <- c( 54 , 27 , 24 , 39 , 52 , 28 , 33 , 40 , 32 , 33 , 30 , 58 , 33 , 46 , 34 , 35 , 50 , 60 , 30 , 28 , 30 , 29 , 56 , 43 , 62 , 60 , 37 , 48 , 27 , 32 , 53 , 43 , 41 , 33 , 29 , 49 , 29 , 24 , 45 , 34 , 56 , 51 , 41 , 38 , 33 , 29 , 34 , 31 , 35 , 43 , 29 , 30 , 30 , 33 , 33 , 46 , 45 , 51 , 32 , 44 , 37 , 46 , 28 , 31 , 51 , 40 , 44 , 28 , 48 , 28 , 44 , 58 , 27 , 33 , 42 , 45 , 36 , 37 , 26 , 47 , 39 , 38 , 36 , 66 , 50 , 31 , 34 , 26 , 53 , 44 , 45 , 24 , 33 , 34 , 50 , 31 , 54 , 38 , 31 , 40 )
runNet <- c( 90 , 74.22 , 90.85 , 91.7 , 94.13 , 99.13 , 78.98 , 102.6 , 111.6 , 100.9 , 81.37 , 82.63 , 83.32 , 71.17 , 73.62 , 79.32 , 111.5 , 86.62 , 111.3 , 69.7 , 66.5 , 65.52 , 99.38 , 89.52 , 76.23 , 79.2 , 59.88 , 124.5 , 107.5 , 105.5 , 78.1 , 99.22 , 96.68 , 59.25 , 94.75 , 93.45 , 76.15 , 91.53 , 75.07 , 80.9 , 94.18 , 97.57 , 86.73 , 92.77 , 99.67 , 85.38 , 65.97 , 77.38 , 94.42 , 78.92 , 87.03 , 97.78 , 86.82 , 113.1 , 88.58 , 74.05 , 88.52 , 83.73 , 81.4 , 69 , 78.43 , 101.2 , 81.2 , 84.45 , 105.1 , 70.38 , 83.28 , 106.5 , 79.12 , 69.83 , 73.35 , 66.07 , 86.23 , 76.72 , 91.88 , 79.12 , 81.63 , 79.67 , 86.62 , 71.63 , 99.28 , 90.58 , 101.2 , 95.8 , 77.58 , 102.4 , 79.67 , 111.2 , 76.88 , 104.4 , 117.4 , 86.68 , 94.78 , 86.1 , 79.63 , 79.23 , 94.97 , 85.67 , 97.07 , 83.15 )
runGun <- c( 90.28 , 75.08 , 93.55 , 95.18 , 99.4 , 105.6 , 81.5 , 107.8 , 116.6 , 104.6 , 82.18 , 82.95 , 84.32 , 71.32 , 74.68 , 80.52 , 114.8 , 87.05 , 115.6 , 70.17 , 66.75 , 66.07 , 105.2 , 95.63 , 81.27 , 80.13 , 60.02 , 125.1 , 107.5 , 110 , 78.53 , 109.6 , 102.5 , 59.43 , 101.1 , 100.3 , 76.47 , 96.98 , 76.43 , 82.45 , 97.8 , 103.6 , 89.53 , 93.63 , 104.5 , 89.73 , 66.25 , 78.62 , 99.47 , 79.15 , 91.13 , 105.4 , 89.85 , 117.8 , 89.45 , 74.93 , 89.2 , 87.32 , 87.9 , 69.13 , 79.97 , 111 , 84.5 , 85.55 , 110.5 , 74.15 , 83.58 , 114.7 , 79.62 , 70.42 , 73.85 , 66.3 , 92.37 , 77.53 , 98.77 , 79.65 , 85.17 , 85.67 , 92.68 , 72.15 , 107.6 , 96.18 , 103.4 , 99.55 , 78.85 , 107 , 81.42 , 114.4 , 77.85 , 108.5 , 121.7 , 92.68 , 96.87 , 88.08 , 80.43 , 79.93 , 99.3 , 90.47 , 102.3 , 84.75 )
runSex <- c( 'F' , 'M' , 'F' , 'F' , 'M' , 'M' , 'M' , 'F' , 'F' , 'F' , 'M' , 'M' , 'M' , 'M' , 'M' , 'F' , 'M' , 'M' , 'F' , 'M' , 'M' , 'M' , 'F' , 'M' , 'F' , 'M' , 'M' , 'F' , 'F' , 'F' , 'F' , 'F' , 'M' , 'M' , 'F' , 'M' , 'F' , 'F' , 'M' , 'M' , 'M' , 'M' , 'F' , 'F' , 'F' , 'M' , 'M' , 'M' , 'M' , 'M' , 'F' , 'F' , 'M' , 'M' , 'M' , 'M' , 'M' , 'M' , 'M' , 'F' , 'M' , 'F' , 'M' , 'F' , 'M' , 'M' , 'M' , 'F' , 'M' , 'M' , 'M' , 'M' , 'M' , 'M' , 'M' , 'M' , 'F' , 'M' , 'M' , 'M' , 'F' , 'F' , 'F' , 'M' , 'M' , 'F' , 'M' , 'F' , 'M' , 'F' , 'F' , 'M' , 'F' , 'M' , 'M' , 'F' , 'M' , 'M' , 'F' , 'M' )
runYear <- c( 2004 , 2001 , 2000 , 2004 , 2005 , 2003 , 2002 , 2001 , 2004 , 2005 , 2005 , 2005 , 2002 , 2004 , 2003 , 2005 , 2005 , 2002 , 2006 , 2006 , 2005 , 2003 , 2004 , 2003 , 2003 , 2003 , 2003 , 2006 , 2004 , 2002 , 2005 , 2006 , 2004 , 2005 , 2004 , 2002 , 2002 , 2004 , 2004 , 2002 , 2001 , 2004 , 2001 , 2002 , 2003 , 2005 , 2004 , 2001 , 2005 , 2003 , 2004 , 2004 , 2003 , 2002 , 2005 , 2002 , 2000 , 2001 , 2005 , 2006 , 2004 , 2006 , 2000 , 2004 , 2002 , 2002 , 2004 , 2006 , 2004 , 2002 , 2005 , 2000 , 2005 , 2003 , 2004 , 2003 , 2005 , 2003 , 2005 , 2004 , 2005 , 2001 , 2000 , 2000 , 2001 , 2002 , 2005 , 2004 , 2006 , 2001 , 2005 , 2005 , 2003 , 2001 , 2005 , 2000 , 2002 , 2004 , 2004 , 2006 )
runPrevious <- c( 5 , 1 , 0 , 1 , 1 , 1 , 1 , 1 , 2 , 2 , 4 , 5 , 0 , 5 , 1 , 0 , 3 , 3 , 0 , 2 , 1 , 0 , 1 , 1 , 4 , 1 , 0 , 4 , 2 , 1 , 4 , 1 , 1 , 4 , 1 , 1 , 1 , 1 , 0 , 2 , 2 , 1 , 1 , 1 , 0 , 2 , 2 , 2 , 2 , 1 , 2 , 1 , 0 , 1 , 1 , 0 , 1 , 0 , 3 , 1 , 1 , 1 , 1 , 3 , 2 , 1 , 5 , 1 , 5 , 0 , 6 , 1 , 1 , 2 , 2 , 1 , 3 , 0 , 0 , 1 , 0 , 1 , 1 , 1 , 2 , 1 , 1 , 1 , 0 , 1 , 3 , 1 , 0 , 1 , 0 , 1 , 0 , 3 , 1 , 4 )
runNRuns <- c( 9 , 8 , 4 , 3 , 4 , 5 , 4 , 6 , 3 , 4 , 6 , 6 , 4 , 8 , 4 , 3 , 7 , 8 , 3 , 4 , 3 , 4 , 6 , 4 , 5 , 3 , 3 , 5 , 4 , 4 , 6 , 4 , 5 , 6 , 4 , 4 , 3 , 3 , 5 , 8 , 7 , 5 , 8 , 3 , 3 , 4 , 5 , 5 , 3 , 5 , 3 , 4 , 4 , 3 , 3 , 3 , 4 , 3 , 5 , 4 , 4 , 4 , 5 , 6 , 5 , 3 , 10 , 4 , 9 , 5 , 7 , 3 , 4 , 5 , 4 , 4 , 6 , 5 , 4 , 3 , 3 , 3 , 9 , 6 , 3 , 3 , 3 , 4 , 3 , 7 , 4 , 3 , 5 , 6 , 3 , 4 , 3 , 4 , 3 , 6 )
runStart_Position <- c( 'eager' , 'eager' , 'calm' , 'mellow' , 'mellow' , 'mellow' , 'calm' , 'mellow' , 'mellow' , 'mellow' , 'eager' , 'eager' , 'eager' , 'eager' , 'calm' , 'calm' , 'mellow' , 'eager' , 'mellow' , 'eager' , 'eager' , 'eager' , 'mellow' , 'mellow' , 'mellow' , 'eager' , 'eager' , 'eager' , 'eager' , 'mellow' , 'eager' , 'mellow' , 'mellow' , 'eager' , 'mellow' , 'mellow' , 'eager' , 'mellow' , 'calm' , 'calm' , 'mellow' , 'mellow' , 'calm' , 'eager' , 'mellow' , 'mellow' , 'eager' , 'calm' , 'mellow' , 'eager' , 'mellow' , 'mellow' , 'mellow' , 'mellow' , 'eager' , 'eager' , 'eager' , 'mellow' , 'mellow' , 'eager' , 'calm' , 'mellow' , 'mellow' , 'calm' , 'mellow' , 'mellow' , 'eager' , 'mellow' , 'eager' , 'eager' , 'eager' , 'eager' , 'mellow' , 'eager' , 'mellow' , 'eager' , 'mellow' , 'mellow' , 'mellow' , 'eager' , 'mellow' , 'mellow' , 'calm' , 'mellow' , 'calm' , 'mellow' , 'calm' , 'mellow' , 'eager' , 'mellow' , 'mellow' , 'mellow' , 'calm' , 'calm' , 'eager' , 'eager' , 'mellow' , 'mellow' , 'mellow' , 'calm' )
# Reassemble a 100-row sample of the course's Runners data from the literal
# vectors defined above (runAge, runNet, ..., runIDs); character columns are
# kept as character (stringsAsFactors=FALSE), counts coerced to integer.
Runners_100 <- data.frame(age=as.integer(runAge),
net=runNet,
gun=runGun,
sex=runSex,
year=as.integer(runYear),
previous=as.integer(runPrevious),
nruns=as.integer(runNRuns),
start_position=runStart_Position,
orig.id=as.integer(runIDs),
stringsAsFactors=FALSE
)
str(Runners_100)
## 'data.frame': 100 obs. of 9 variables:
## $ age : int 54 27 24 39 52 28 33 40 32 33 ...
## $ net : num 90 74.2 90.8 91.7 94.1 ...
## $ gun : num 90.3 75.1 93.5 95.2 99.4 ...
## $ sex : chr "F" "M" "F" "F" ...
## $ year : int 2004 2001 2000 2004 2005 2003 2002 2001 2004 2005 ...
## $ previous : int 5 1 0 1 1 1 1 1 2 2 ...
## $ nruns : int 9 8 4 3 4 5 4 6 3 4 ...
## $ start_position: chr "eager" "eager" "calm" "mellow" ...
## $ orig.id : int 5035 10 9271 256 1175 17334 1571 5264 15985 2237 ...
# Compare a base model with an augmented model on the SAME training data.
# Key teaching point: adding a variable always reduces (or ties) in-sample
# MSE, even when the variable is pure noise — see the bogus example below.
# Build a model of net running time
base_model <- lm(net ~ age + sex, data = Runners_100)
# Evaluate base_model on the training data
base_model_output <- predict(base_model, newdata = Runners_100)
# Build the augmented model
aug_model <- lm(net ~ age + sex + previous, data = Runners_100)
# Evaluate aug_model on the training data
aug_model_output <- predict(aug_model, newdata = Runners_100)
# How much do the model outputs differ?
mean((base_model_output - aug_model_output) ^ 2, na.rm = TRUE)
## [1] 0.5157921
# Build and evaluate the base model on Runners_100
base_model <- lm(net ~ age + sex, data = Runners_100)
base_model_output <- predict(base_model, newdata = Runners_100)
# Build and evaluate the augmented model on Runners_100
aug_model <- lm(net ~ age + sex + previous, data=Runners_100)
aug_model_output <- predict(aug_model, newdata = Runners_100)
# Find the case-by-case differences
base_model_differences <- with(Runners_100, net - base_model_output)
aug_model_differences <- with(Runners_100, net - aug_model_output)
# Calculate mean square errors
mean(base_model_differences ^ 2)
## [1] 131.5594
mean(aug_model_differences ^ 2)
## [1] 131.0436
data(CPS85, package="mosaicData")
# Add bogus column to CPS85 (don't change)
# A random TRUE/FALSE column that cannot carry real signal — yet the
# in-sample MSE still drops slightly when it is added.
CPS85$bogus <- rnorm(nrow(CPS85)) > 0
# Make the base model
base_model <- lm(wage ~ educ + sector + sex, data = CPS85)
# Make the bogus augmented model
aug_model <- lm(wage ~ educ + sector + sex + bogus, data = CPS85)
# Find the MSE of the base model
mean((CPS85$wage - predict(base_model, newdata = CPS85)) ^ 2)
## [1] 19.73308
# Find the MSE of the augmented model
mean((CPS85$wage - predict(aug_model, newdata = CPS85)) ^ 2)
## [1] 19.71618
# Cross validation: fit on a random ~50% of cases, score on the rest, so
# the error estimate is out-of-sample rather than in-sample.
# Generate a random TRUE or FALSE for each case in Runners_100
Runners_100$training_cases <- rnorm(nrow(Runners_100)) > 0
# Build base model net ~ age + sex with training cases
base_model <-
lm(net ~ age + sex, data = subset(Runners_100, training_cases))
# Evaluate the model for the testing cases
Preds <-
statisticalModeling::evaluate_model(base_model, data = subset(Runners_100, !training_cases))
# Calculate the MSE on the testing data
with(data = Preds, mean((net - model_output)^2))
## [1] 134.1748
# The model
model <- lm(net ~ age + sex, data = Runners_100)
# Find the in-sample error (using the training data)
in_sample <- statisticalModeling::evaluate_model(model, data = Runners_100)
in_sample_error <-
with(in_sample, mean((net - model_output)^2, na.rm = TRUE))
# Calculate MSE for many different trials
# cv_pred_error() repeats the train/test split several times and returns
# one MSE per trial.
trials <- statisticalModeling::cv_pred_error(model)
# View the cross-validated prediction errors
trials
## mse model
## 1 136.7139 model
## 2 141.0271 model
## 3 137.0930 model
## 4 144.8609 model
## 5 140.4862 model
# Find confidence interval on trials and compare to training_error
# The CV errors are significantly larger than the in-sample error (131.56),
# illustrating that in-sample MSE is optimistically biased.
mosaic::t.test(~ mse, mu = in_sample_error, data = trials)
##
## One Sample t-test
##
## data: trials$mse
## t = 5.7045, df = 4, p-value = 0.004668
## alternative hypothesis: true mean is not equal to 131.5594
## 95 percent confidence interval:
## 135.9105 144.1620
## sample estimates:
## mean of x
## 140.0362
# The base model
base_model <- lm(net ~ age + sex, data = Runners_100)
# An augmented model adding previous as an explanatory variable
aug_model <- lm(net ~ age + sex + previous, data = Runners_100)
# Run cross validation trials on the two models
trials <- statisticalModeling::cv_pred_error(base_model, aug_model)
# Compare the two sets of cross-validated errors
# No significant difference (p = 0.62): `previous` does not improve
# out-of-sample prediction even though it lowered in-sample MSE.
t.test(mse ~ model, data = trials)
##
## Welch Two Sample t-test
##
## data: mse by model
## t = 0.52618, df = 6.3781, p-value = 0.6166
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.694470 4.197952
## sample estimates:
## mean in group aug_model mean in group base_model
## 142.3673 141.6156
Chapter 4 - Exploring data with models
Prediction error for categorical variables:
Exploring data for relationships - example of the NHANES data from library(NHANES):
Example code includes:
data(Runners, package="statisticalModeling")
# Build the null model with rpart()
# The constant "explanatory" variable forces rpart to predict the single
# most common class for every case — the baseline error rate to beat.
Runners$all_the_same <- 1 # null "explanatory" variable
null_model <- rpart::rpart(start_position ~ all_the_same, data = Runners)
# Evaluate the null model on training data
null_model_output <- statisticalModeling::evaluate_model(null_model, data = Runners, type = "class")
# Calculate the error rate
with(data = null_model_output, mean(start_position != model_output, na.rm = TRUE))
## [1] 0.5853618
# Generate a random guess...
null_model_output$random_guess <- mosaic::shuffle(Runners$start_position)
# ...and find the error rate
# Random guessing does worse than always predicting the majority class.
with(data = null_model_output, mean(start_position != random_guess, na.rm = TRUE))
## [1] 0.6530868
# Train the model
model <- rpart::rpart(start_position ~ age + sex, data = Runners, cp = 0.001)
# Get model output with the training data as input
# NOTE(review): `model_output` names both this data frame and the column
# used inside with() below; with() resolves the column first, but renaming
# one of them would be clearer.
model_output <- statisticalModeling::evaluate_model(model, data = Runners, type = "class")
# Find the error rate
with(data = model_output, mean(start_position != model_output, na.rm = TRUE))
## [1] 0.5567794
# Do not have this data (should be 93x11 for Training_data and 107x11 for Testing_data) - orig.id, all_the_same, training_case
trainData <- c( 14340 , 1667 , 14863 , 15211 , 685 , 16629 , 16620 , 683 , 9695 , 4281 , 15395 , 17308 , 14847 , 2405 , 15696 , 6351 , 10266 , 14345 , 1145 , 9968 , 3409 , 3798 , 4209 , 2084 , 15561 , 7700 , 8620 , 17266 , 1638 , 13963 , 8621 , 14871 , 2945 , 14359 , 9723 , 10371 , 14271 , 826 , 4843 , 15191 , 14171 , 11845 , 15223 , 9213 , 4913 , 8194 , 15509 , 4562 , 15231 , 14317 , 2933 , 2866 , 15242 , 11343 , 15388 , 1104 , 13734 , 17186 , 5427 , 16100 , 5262 , 5873 , 5067 , 1073 , 3164 , 2164 , 1292 , 12337 , 13895 , 4379 , 11012 , 11872 , 10098 , 1130 , 1357 , 6150 , 493 , 7858 , 8761 , 18014 , 445 , 4207 , 15893 , 17022 , 703 , 17615 , 12517 , 181 , 9864 , 8611 , 4171 , 1732 , 11067 )
testData <- c( 16376 , 1316 , 15357 , 8699 , 13896 , 12064 , 13525 , 11807 , 13152 , 4473 , 12926 , 1134 , 7664 , 6597 , 17254 , 5991 , 17042 , 2701 , 2509 , 13264 , 10998 , 10482 , 7534 , 351 , 5866 , 18107 , 18046 , 15454 , 10602 , 10974 , 6988 , 7771 , 8223 , 14225 , 4409 , 2361 , 11462 , 4987 , 8440 , 2483 , 14984 , 14880 , 311 , 7505 , 4371 , 2434 , 15410 , 16068 , 16252 , 5942 , 8123 , 15375 , 15016 , 2379 , 7099 , 5664 , 11381 , 10688 , 1525 , 5506 , 4900 , 16574 , 14272 , 13912 , 3779 , 14584 , 15809 , 2908 , 16329 , 12042 , 1621 , 9248 , 5738 , 1345 , 6319 , 12575 , 3805 , 2895 , 15004 , 9918 , 11422 , 3592 , 10136 , 5941 , 12274 , 14178 , 4667 , 3393 , 11801 , 3814 , 8244 , 11721 , 14940 , 2572 , 14719 , 11398 , 13704 , 17989 , 12056 , 8215 , 8894 , 8303 , 7816 , 14698 , 17293 , 469 , 3533 )
Testing_data <- Runners[complete.cases(Runners), ][testData, ] %>%
mutate(orig.id=as.character(testData), all_the_same=1, training_case=FALSE)
Training_data <- Runners[complete.cases(Runners), ][trainData, ] %>%
mutate(orig.id=as.character(trainData), all_the_same=1, training_case=TRUE)
# Train the models
null_model <- rpart::rpart(start_position ~ all_the_same,
data = Training_data, cp = 0.001)
model_1 <- rpart::rpart(start_position ~ age,
data = Training_data, cp = 0.001)
model_2 <- rpart::rpart(start_position ~ age + sex,
data = Training_data, cp = 0.001)
# Find the out-of-sample error rate
null_output <- statisticalModeling::evaluate_model(null_model, data = Testing_data, type = "class")
model_1_output <- statisticalModeling::evaluate_model(model_1, data = Testing_data, type = "class")
model_2_output <- statisticalModeling::evaluate_model(model_2, data = Testing_data, type = "class")
# Calculate the error rates
null_rate <- with(data = null_output,
mean(start_position != model_output, na.rm = TRUE))
model_1_rate <- with(data = model_1_output,
mean(start_position != model_output, na.rm = TRUE))
model_2_rate <- with(data = model_2_output,
mean(start_position != model_output, na.rm = TRUE))
# Display the error rates
null_rate
## [1] 0.5233645
model_1_rate
## [1] 0.588785
model_2_rate
## [1] 0.5700935
model_2 <- rpart::rpart(net ~ age + sex, data = Runners, cp = 0.001)
rpart.plot::prp(model_2, type = 3)
data(Birth_weight, package="statisticalModeling")
model_1 <- rpart::rpart(baby_wt ~ smoke + income,
data = Birth_weight)
model_2 <- rpart::rpart(baby_wt ~ mother_age + mother_wt,
data = Birth_weight)
rpart.plot::prp(model_1, type = 3)
rpart.plot::prp(model_2, type = 3)
model_3 <- rpart::rpart(baby_wt ~ smoke + income + mother_age + mother_wt, data=Birth_weight)
rpart.plot::prp(model_3, type=3)
model_full <- rpart::rpart(baby_wt ~ ., data=Birth_weight)
rpart.plot::prp(model_full, type=3)
model_gest <- rpart::rpart(gestation ~ . -baby_wt, data=Birth_weight)
rpart.plot::prp(model_gest, type=3)
Chapter 5 - Covariates and Effect Size
Covariates and uses for models - making predictions with available data, exploring a large/complex dataset, anticipate outcome of intervention:
Effect size - how much does the model output change for a given change in the input?
Example code includes:
data(Houses_for_sale, package="statisticalModeling")
# Train the model price ~ fireplaces
simple_model <- lm(price ~ fireplaces, data = Houses_for_sale)
# Evaluate simple_model
statisticalModeling::evaluate_model(simple_model)
## fireplaces model_output
## 1 0 171823.9
## 2 1 238522.7
# Naive estimate of what a fireplace is "worth": ~$66.7k. This is
# confounded — houses with fireplaces also tend to be larger.
naive_worth <- 238522.7 - 171823.9
naive_worth
## [1] 66698.8
# Train another model including living_area
# Adding living_area as a covariate isolates the fireplace effect at a
# fixed house size.
sophisticated_model <-lm(price ~ fireplaces + living_area, data = Houses_for_sale)
# Evaluate that model
statisticalModeling::evaluate_model(sophisticated_model)
## fireplaces living_area model_output
## 1 0 1000 124043.6
## 2 1 1000 133006.1
## 3 0 2000 233357.1
## 4 1 2000 242319.5
## 5 0 3000 342670.6
## 6 1 3000 351633.0
# Find price difference for fixed living_area
# Holding living_area constant, a fireplace adds only ~$9k — most of the
# naive $66.7k was really the effect of house size.
sophisticated_worth <- 242319.5 - 233357.1
sophisticated_worth
## [1] 8962.4
data(Crime, package="statisticalModeling")
# Train model_1 and model_2
model_1 <- lm(R ~ X, data = Crime)
model_2 <- lm(R ~ W, data = Crime)
# Evaluate each model...
statisticalModeling::evaluate_model(model_1)
## X model_output
## 1 100 106.82223
## 2 200 89.46721
## 3 300 72.11219
statisticalModeling::evaluate_model(model_2)
## W model_output
## 1 400 68.32909
## 2 600 103.70777
## 3 800 139.08644
change_with_X <- 89.46721 - 106.82223
change_with_X
## [1] -17.35502
change_with_W <- 103.70777 - 68.32909
change_with_W
## [1] 35.37868
# Train model_3 using both X and W as explanatory variables
model_3 <- lm(R ~ X + W, data = Crime)
# Evaluate model_3
statisticalModeling::evaluate_model(model_3)
## X W model_output
## 1 100 400 -62.60510
## 2 200 400 31.03422
## 3 300 400 124.67354
## 4 100 600 41.22502
## 5 200 600 134.86434
## 6 300 600 228.50366
## 7 100 800 145.05515
## 8 200 800 238.69447
## 9 300 800 332.33379
# Find the difference in output for each of X and W
change_with_X_holding_W_constant <- 134.86434 - 228.50366
change_with_X_holding_W_constant
## [1] -93.63932
change_with_W_holding_X_constant <- 134.86434 - 31.03422
change_with_W_holding_X_constant
## [1] 103.8301
data(Trucking_jobs, package="statisticalModeling")
# Train the five models
model_1 <- lm(earnings ~ sex, data = Trucking_jobs)
model_2 <- lm(earnings ~ sex + age, data = Trucking_jobs)
model_3 <- lm(earnings ~ sex + hiredyears, data = Trucking_jobs)
model_4 <- lm(earnings ~ sex + title, data = Trucking_jobs)
model_5 <- lm(earnings ~ sex + age + hiredyears + title, data = Trucking_jobs)
# Evaluate each model...
statisticalModeling::evaluate_model(model_1)
## sex model_output
## 1 M 40236.35
## 2 F 35501.25
statisticalModeling::evaluate_model(model_2, age = 40)
## sex age model_output
## 1 M 40 41077.03
## 2 F 40 38722.71
statisticalModeling::evaluate_model(model_3, hiredyears = 5)
## sex hiredyears model_output
## 1 M 5 39996.93
## 2 F 5 36366.89
statisticalModeling::evaluate_model(model_4, title = "REGL CARRIER REP")
## sex title model_output
## 1 M REGL CARRIER REP 27838.38
## 2 F REGL CARRIER REP 28170.71
statisticalModeling::evaluate_model(model_5, age = 40, hiredyears = 5,
title = "REGL CARRIER REP")
## sex age hiredyears title model_output
## 1 M 40 5 REGL CARRIER REP 30976.42
## 2 F 40 5 REGL CARRIER REP 30991.70
# ...and calculate the gender difference in earnings
diff_1 <- 40236.35 - 35501.25
diff_1
## [1] 4735.1
diff_2 <- 41077.03 - 38722.71
diff_2
## [1] 2354.32
diff_3 <- 39996.93 - 36366.89
diff_3
## [1] 3630.04
diff_4 <- 27838.38 - 28170.71
diff_4
## [1] -332.33
diff_5 <- 30976.42 - 30991.70
diff_5
## [1] -15.28
data(AARP, package="statisticalModeling")
modLin <- lm(Cost ~ Age + Sex + Coverage, data=AARP)
statisticalModeling::evaluate_model(modLin)
## Age Sex Coverage model_output
## 1 40 F 0 -66.4702087
## 2 60 F 0 0.5638866
## 3 80 F 0 67.5979818
## 4 40 M 0 -56.2374309
## 5 60 M 0 10.7966643
## 6 80 M 0 77.8307596
## 7 40 F 50 24.2980606
## 8 60 F 50 91.3321558
## 9 80 F 50 158.3662510
## 10 40 M 50 34.5308383
## 11 60 M 50 101.5649336
## 12 80 M 50 168.5990288
statisticalModeling::effect_size(modLin, ~ Age)
## slope Age to:Age Sex Coverage
## 1 3.351705 59.5 68.16025 F 20
statisticalModeling::effect_size(modLin, ~ Sex)
## change Sex to:Sex Age Coverage
## 1 10.23278 F M 59.5 20
statisticalModeling::effect_size(modLin, ~ Coverage)
## slope Coverage to:Coverage Age Sex
## 1 1.815365 20 37.23783 59.5 F
data(College_grades, package="statisticalModeling")
# Calculating the GPA
gpa_mod_1 <- lm(gradepoint ~ sid, data = College_grades)
# The GPA for two students
statisticalModeling::evaluate_model(gpa_mod_1, sid = c("S32115", "S32262"))
## sid model_output
## 1 S32115 3.448571
## 2 S32262 3.442500
# Use effect_size()
statisticalModeling::effect_size(gpa_mod_1, ~ sid)
## change sid to:sid
## 1 0.4886364 S32259 S32364
# Specify from and to levels to compare
statisticalModeling::effect_size(gpa_mod_1, ~ sid, sid = "S32115", to = "S32262")
## change sid to:sid
## 1 -0.006071429 S32115 S32262
# A better model?
gpa_mod_2 <- lm(gradepoint ~ sid + dept + level, data = College_grades)
# Find difference between the same two students as before
statisticalModeling::effect_size(gpa_mod_2, ~ sid, sid = "S32115", to = "S32262")
## change sid to:sid dept level
## 1 0.4216295 S32115 S32262 d 200
data(Houses_for_sale, package="statisticalModeling")
modAll <- lm(price ~ living_area + land_value + fireplaces, data=Houses_for_sale)
statisticalModeling::effect_size(modAll, ~ land_value)
## slope land_value to:land_value living_area fireplaces
## 1 0.9559322 25000 60021.17 1634.5 1
statisticalModeling::effect_size(modAll, ~ fireplaces)
## slope fireplaces to:fireplaces living_area land_value
## 1 8100.298 1 1.556102 1634.5 25000
statisticalModeling::effect_size(modAll, ~ living_area)
## slope living_area to:living_area land_value fireplaces
## 1 86.81317 1634.5 2254.436 25000 1
Chapter 1 - Effect Size and Interaction
Multiple explanatory variables - commonly use mean/median for each continuous variable, and most common for categorical:
Categorical response variables - output is a classification rather than continuous:
Interactions among explanatory variables:
Example code includes:
data(Houses_for_sale, package="statisticalModeling")
# Build your model
my_model <- rpart::rpart(price ~ living_area + bathrooms + pct_college,
data = Houses_for_sale)
# Graph the model
statisticalModeling::fmodel(my_model, ~ living_area + bathrooms + pct_college)
data(NHANES, package="NHANES")
# Build the model
mod <- lm(Pulse ~ Height + BMI + Gender, data = NHANES)
# Confirm by reconstructing the graphic provided
statisticalModeling::fmodel(mod, ~ Height + BMI + Gender) +
ggplot2::ylab("Pulse")
# Find effect size
statisticalModeling::effect_size(mod, ~ BMI)
## slope BMI to:BMI Height Gender
## 1 0.06025728 25.98 33.35658 166 female
# Replot the model
statisticalModeling::fmodel(mod, ~ BMI + Height + Gender) +
ggplot2::ylab("Pulse")
model_1 <- rpart::rpart(start_position ~ age + sex + nruns,
data = Runners, cp = 0.001)
as_class <- statisticalModeling::evaluate_model(model_1, type = "class")
as_prob <- statisticalModeling::evaluate_model(model_1)
# Calculate effect size with respect to sex
statisticalModeling::effect_size(model_1, ~ sex)
## change.calm change.eager change.mellow sex to:sex age nruns
## 1 0.01281487 -0.2192357 0.2064208 M F 40 4
# Calculate effect size with respect to age
statisticalModeling::effect_size(model_1, ~ age)
## slope.calm slope.eager slope.mellow age to:age sex nruns
## 1 0.00497811 -0.01316334 0.008185229 40 50.84185 M 4
# Calculate effect size with respect to nruns
statisticalModeling::effect_size(model_1, ~ nruns)
## slope.calm slope.eager slope.mellow nruns to:nruns age sex
## 1 0.004900487 0.02725955 -0.03216004 4 5.734239 40 M
data(Whickham, package="mosaicData")
# An rpart model
mod1 <- rpart::rpart(outcome ~ age + smoker, data = Whickham)
# Logistic regression
mod2 <- glm(outcome == "Alive" ~ age + smoker,
data = Whickham, family = "binomial")
# Visualize the models with fmodel()
statisticalModeling::fmodel(mod1)
statisticalModeling::fmodel(mod2)
# Find the effect size of smoker
statisticalModeling::effect_size(mod1, ~ smoker)
## change.Alive change.Dead smoker to:smoker age
## 1 0 0 No Yes 46
statisticalModeling::effect_size(mod2, ~ smoker)
## change smoker to:smoker age
## 1 -0.02479699 No Yes 46
data(Birth_weight, package="statisticalModeling")
# Build the model without interaction
mod1 <- lm(baby_wt ~ gestation + smoke, data=Birth_weight)
# Build the model with interaction
mod2 <- lm(baby_wt ~ gestation * smoke, data=Birth_weight)
# Plot each model
statisticalModeling::fmodel(mod1) +
ggplot2::ylab("baby_wt")
statisticalModeling::fmodel(mod2) +
ggplot2::ylab("baby_wt")
# Cross-validated comparison of additive vs. interaction price models
data(Used_Fords, package="statisticalModeling")
# Train model_1
model_1 <- lm(Price ~ Age + Mileage,
data = Used_Fords)
# Train model_2
model_2 <- lm(Price ~ Age * Mileage,
data = Used_Fords)
# Plot both models
statisticalModeling::fmodel(model_1)
statisticalModeling::fmodel(model_2)
# Cross validate and compare prediction errors
# cv_pred_error() returns one MSE per cross-validation trial per model;
# the t-test below shows the interaction model has much lower MSE
res <- statisticalModeling::cv_pred_error(model_1, model_2)
t.test(mse ~ model, data = res)
##
## Welch Two Sample t-test
##
## data: mse by model
## t = 283.32, df = 6.0528, p-value = 1.038e-13
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 2424655 2466812
## sample estimates:
## mean in group model_1 mean in group model_2
## 6100726 3654992
Chapter 2 - Total and Partial Change
Interpreting effect size - magnitude is important, but only if interpreted properly in natural units (e.g., dollars of price per square foot of living area):
R-squared is also known as the “coefficient of determination” and uses a capital R:
Degrees of freedom - Kaggle example based on restaurant data (137 x 40 with City, City.Group, Type, PS1-PS37 and a 137x1 vector Revenue):
Example code includes:
# Partial effect sizes from a multi-covariate house-price model
data(Houses_for_sale, package="statisticalModeling")
# Train a model of house prices
price_model_1 <- lm(price ~ land_value + living_area + fireplaces + bathrooms + bedrooms,
data = Houses_for_sale
)
# Effect size of living area (default step chosen by effect_size)
statisticalModeling::effect_size(price_model_1, ~ living_area)
## slope living_area to:living_area land_value fireplaces bathrooms
## 1 76.06617 1634.5 2254.436 25000 1634.5 2
## bedrooms
## 1 3
# Effect size of bathrooms (step = 1: add exactly one bathroom)
statisticalModeling::effect_size(price_model_1, ~ bathrooms, step=1)
## slope bathrooms to:bathrooms land_value living_area fireplaces
## 1 26156.43 2 3 25000 1634.5 1
## bedrooms
## 1 3
# Effect size of bedrooms
# NOTE: negative partial effect — adding a bedroom while holding living_area
# fixed means smaller rooms
statisticalModeling::effect_size(price_model_1, ~ bedrooms, step=1)
## slope bedrooms to:bedrooms land_value living_area fireplaces
## 1 -8222.853 3 4 25000 1634.5 1
## bathrooms
## 1 2
# Let living_area change as it will (drop it as a covariate)
price_model_2 <- lm(price ~ land_value + fireplaces + bathrooms + bedrooms,
data = Houses_for_sale
)
# Effect size of bedroom in price_model_2 — now positive, since living_area
# is free to grow along with the bedroom count
statisticalModeling::effect_size(price_model_2, ~ bedrooms, step=1)
## slope bedrooms to:bedrooms land_value fireplaces bathrooms
## 1 13882.42 3 4 25000 1 2
# Total change: evaluate the model under concrete "scenarios" and difference
# the outputs, instead of reading a single partial slope
# Train a model of house prices
price_model <- lm(price ~ land_value + living_area + fireplaces + bathrooms + bedrooms,
data = Houses_for_sale
)
# Evaluate the model in scenario 1 (unspecified inputs get default levels)
statisticalModeling::evaluate_model(price_model, living_area = 2000, bedrooms = 2, bathrooms = 1)
## land_value living_area fireplaces bathrooms bedrooms model_output
## 1 0 2000 0 1 2 181624.0
## 2 50000 2000 0 1 2 228787.1
## 3 0 2000 1 1 2 185499.2
## 4 50000 2000 1 1 2 232662.4
# Evaluate the model in scenario 2 (a bit more area, one more bedroom)
statisticalModeling::evaluate_model(price_model, living_area = 2140, bedrooms = 3, bathrooms = 1)
## land_value living_area fireplaces bathrooms bedrooms model_output
## 1 0 2140 0 1 3 184050.4
## 2 50000 2140 0 1 3 231213.5
## 3 0 2140 1 1 3 187925.7
## 4 50000 2140 1 1 3 235088.8
# Find the difference in output (hard-coded from the rows with
# land_value = 50000, fireplaces = 0 above)
price_diff <- 231213.5 - 228787.1
price_diff
## [1] 2426.4
# Evaluate the second scenario again, but add a half bath
statisticalModeling::evaluate_model(price_model, living_area = 2165, bedrooms = 3, bathrooms = 1.5)
## land_value living_area fireplaces bathrooms bedrooms model_output
## 1 0 2165 0 1.5 3 199030.3
## 2 50000 2165 0 1.5 3 246193.4
## 3 0 2165 1 1.5 3 202905.5
## 4 50000 2165 1 1.5 3 250068.7
# Calculate the price difference
new_price_diff <- 246193.4 - 228787.1
new_price_diff
## [1] 17406.3
# Partial vs. total effect of Age on price: with Mileage in the model the
# Age slope is "partial" (Mileage held fixed); evaluating two realistic
# scenarios (age AND mileage both increase) gives the total effect
# Fit model
car_price_model <- lm(Price ~ Age + Mileage, data = Used_Fords)
# Partial effect size
statisticalModeling::effect_size(car_price_model, ~ Age)
## slope Age to:Age Mileage
## 1 -573.5044 3 6.284152 48897.5
# To find total effect size
statisticalModeling::evaluate_model(car_price_model, Age = 6, Mileage = 42000)
## Age Mileage model_output
## 1 6 42000 9523.781
statisticalModeling::evaluate_model(car_price_model, Age = 7, Mileage = 50000)
## Age Mileage model_output
## 1 7 50000 8400.389
# Price difference between scenarios (round to nearest dollar)
price_difference <- 8400 - 9524
price_difference
## [1] -1124
# Effect for age without mileage in the model
car_price_model_2 <- lm(Price ~ Age, data = Used_Fords)
# Calculate partial effect size — matches the total effect above, since
# Mileage is now free to covary with Age
statisticalModeling::effect_size(car_price_model_2, ~ Age)
## slope Age to:Age
## 1 -1124.556 3 6.284152
# Compute R-squared by hand as var(fitted) / var(response) for three models
data(College_grades, package="statisticalModeling")
data(AARP, package="statisticalModeling")
data(Tadpoles, package="statisticalModeling")
# Drop rows with any NA so the variance ratio is well defined
College_grades <- College_grades[complete.cases(College_grades), ]
# Train some models
model_1 <- lm(gradepoint ~ sid, data = College_grades)
model_2 <- lm(Cost ~ Age + Sex + Coverage, data = AARP)
# I(rtemp^2) adds a quadratic term in rtemp
model_3 <- lm(vmax ~ group + (rtemp + I(rtemp^2)), data = Tadpoles)
# Calculate model output on training data
output_1 <- statisticalModeling::evaluate_model(model_1, data = College_grades)
output_2 <- statisticalModeling::evaluate_model(model_2, data = AARP)
output_3 <- statisticalModeling::evaluate_model(model_3, data = Tadpoles)
# R-squared for the models
with(output_1, var(model_output) / var(gradepoint))
## [1] 0.3222716
with(output_2, var(model_output) / var(Cost))
## [1] 0.8062783
with(output_3, var(model_output) / var(vmax))
## [1] 0.4310651
# Heating degree days: month explains almost all variance, year almost none
data(HDD_Minneapolis, package="statisticalModeling")
# The two models
model_1 <- lm(hdd ~ year, data = HDD_Minneapolis)
model_2 <- lm(hdd ~ month, data = HDD_Minneapolis)
# Find the model output on the training data for each model
output_1 <- statisticalModeling::evaluate_model(model_1, data = HDD_Minneapolis)
output_2 <- statisticalModeling::evaluate_model(model_2, data = HDD_Minneapolis)
# Find R-squared for each of the 2 models
with(output_1, var(model_output) / var(hdd))
## [1] 0.0001121255
with(output_2, var(model_output) / var(hdd))
## [1] 0.9547171
# Kept (commented out) for reference: demonstrates that adding 200 columns of
# pure noise ("bogus") inflates training R-squared while cross-validated MSE
# gets worse — the classic overfitting signature.
# DO NOT HAVE THIS DATASET - Training is 267 x 12 (field 12 is "bogus", a 267x200 matrix of random numbers)
# Train model_1 without bogus
# model_1 <- lm(wage ~ sector, data = Training)
# Train model_2 with bogus
# model_2 <- lm(wage ~ sector + bogus, data = Training)
# Calculate R-squared using the training data
# output_1 <- statisticalModeling::evaluate_model(model_1, data = Training)
# output_2 <- statisticalModeling::evaluate_model(model_2, data = Training)
# with(output_1, var(model_output) / var(wage))
# with(output_2, var(model_output) / var(wage))
# Compare cross-validated MSE
# boxplot(mse ~ model, data = statisticalModeling::cv_pred_error(model_1, model_2))
# Same overfitting demo with available data: mosaic::rand(k) injects k columns
# of random noise; training R-squared climbs with k even though the noise has
# no predictive value
data(CPS85, package="mosaicData")
# Train the four models
model_0 <- lm(wage ~ NULL, data = CPS85)
model_1 <- lm(wage ~ mosaic::rand(100), data = CPS85)
model_2 <- lm(wage ~ mosaic::rand(200), data = CPS85)
model_3 <- lm(wage ~ mosaic::rand(300), data = CPS85)
# Evaluate the models on the training data
output_0 <- statisticalModeling::evaluate_model(model_0, on_training = TRUE)
output_1 <- statisticalModeling::evaluate_model(model_1, on_training = TRUE)
output_2 <- statisticalModeling::evaluate_model(model_2, on_training = TRUE)
output_3 <- statisticalModeling::evaluate_model(model_3, on_training = TRUE)
# Compute R-squared for each model
with(output_0, var(model_output) / var(wage))
## [1] 0
with(output_1, var(model_output) / var(wage))
## [1] 0.2302304
with(output_2, var(model_output) / var(wage))
## [1] 0.3830196
with(output_3, var(model_output) / var(wage))
## [1] 0.5730115
# Compare the null model to model_3 using cross validation
cv_results <- statisticalModeling::cv_pred_error(model_0, model_3, ntrials = 3)
boxplot(mse ~ model, data = cv_results)
# Degrees of freedom: numeric year costs 1 df, but converting year to a
# character/categorical variable costs one df per distinct year, enough to
# drive training R-squared to 1 (perfect — and meaningless — fit)
# Train this model with 24 degrees of freedom
model_1 <- lm(hdd ~ year * month, data = HDD_Minneapolis)
# Calculate R-squared
output_1 <- statisticalModeling::evaluate_model(model_1, data = HDD_Minneapolis)
with(output_1, var(model_output) / var(hdd))
## [1] 0.9554951
# Oops! Numerical year changed to categorical
HDD_Minneapolis$categorical_year <- as.character(HDD_Minneapolis$year)
# This model has many more degrees of freedom
model_2 <- lm(hdd ~ categorical_year * month, data = HDD_Minneapolis)
# Calculate R-squared
output_2 <- statisticalModeling::evaluate_model(model_2, data = HDD_Minneapolis)
## Warning in predict.lm(structure(list(coefficients =
## structure(c(580.000000000084, : prediction from a rank-deficient fit may be
## misleading
with(output_2, var(model_output) / var(hdd))
## [1] 1
Chapter 3 - Sampling Variability
Bootstrapping and precision - applying CI and the like to assess the precision of statistical models:
Scales and transformations - what do the numbers actually represent?
Example code includes:
# Bootstrapping an effect size: resample rows with replacement, refit,
# recompute the effect size, and use the spread across trials as a
# standard error
data(CPS85, package="mosaicData")
# Two starting elements
model <- lm(wage ~ age + sector, data = CPS85)
statisticalModeling::effect_size(model, ~ age)
## slope age to:age sector
## 1 0.07362793 35 46.72657 prof
# For practice: sampling WITH replacement is what makes it a bootstrap
my_test_resample <- sample(1:10, replace = TRUE)
my_test_resample
## [1] 4 10 5 8 9 6 4 6 3 1
# Construct a resampling of CPS85 (same number of rows, duplicates allowed)
trial_1_indices <- sample(1:nrow(CPS85), replace = TRUE)
trial_1_data <- CPS85[trial_1_indices, ]
# Train the model to that resampling
trial_1_model <- lm(wage ~ age + sector, data = trial_1_data)
# Calculate the quantity
statisticalModeling::effect_size(trial_1_model, ~ age)
## slope age to:age sector
## 1 0.09237306 35 47.07791 prof
# Model and effect size from the "real" data
model <- lm(wage ~ age + sector, data = CPS85)
statisticalModeling::effect_size(model, ~ age)
## slope age to:age sector
## 1 0.07362793 35 46.72657 prof
# Generate 10 resampling trials — ensemble() automates the resample-and-refit loop
my_trials <- statisticalModeling::ensemble(model, nreps = 10)
# Find the effect size for each trial
statisticalModeling::effect_size(my_trials, ~ age)
## slope age to:age sector bootstrap_rep
## 1 0.06688941 35 46.72657 prof 1
## 11 0.02485699 35 46.72657 prof 2
## 12 0.04985098 35 46.72657 prof 3
## 13 0.08265600 35 46.72657 prof 4
## 14 0.10400812 35 46.72657 prof 5
## 15 0.07303927 35 46.72657 prof 6
## 16 0.08080486 35 46.72657 prof 7
## 17 0.10319090 35 46.72657 prof 8
## 18 0.08344260 35 46.72657 prof 9
## 19 0.09949821 35 46.72657 prof 10
# Re-do with 100 trials
my_trials <- statisticalModeling::ensemble(model, nreps = 100)
trial_effect_sizes <- statisticalModeling::effect_size(my_trials, ~ age)
# Calculate the standard deviation of the 100 effect sizes
# (this is the bootstrap standard error of the slope)
sd(trial_effect_sizes$slope)
## [1] 0.0189476
# Bootstrap standard error for the value of a fireplace: SE (~3500) is large
# relative to the point estimate (~8100), so precision is modest
# An estimate of the value of a fireplace
model <- lm(price ~ land_value + fireplaces + living_area,
data = Houses_for_sale
)
statisticalModeling::effect_size(model, ~ fireplaces)
## slope fireplaces to:fireplaces land_value living_area
## 1 8100.298 1 1.556102 25000 1634.5
# Generate 100 resampling trials
trials <- statisticalModeling::ensemble(model, nreps = 100)
# Calculate the effect size in each of the trials
effect_sizes_in_trials <- statisticalModeling::effect_size(trials, ~ fireplaces)
# Show a histogram of the effect sizes
hist(effect_sizes_in_trials$slope)
# Calculate the standard error
sd(effect_sizes_in_trials$slope)
## [1] 3529.064
# Log transformations: four nested models of log(Cost), from additive to a
# full three-way interaction with log(Coverage)
data(AARP, package="statisticalModeling")
# Make model with log(Cost)
mod_1 <- lm(log(Cost) ~ Age + Sex + Coverage, data = AARP)
mod_2 <- lm(log(Cost) ~ Age * Sex + Coverage, data = AARP)
mod_3 <- lm(log(Cost) ~ Age * Sex + log(Coverage), data = AARP)
mod_4 <- lm(log(Cost) ~ Age * Sex * log(Coverage), data = AARP)
# To display each model in turn, overlaying the raw data on the model curves
statisticalModeling::fmodel(mod_1, ~ Age + Sex + Coverage,
Coverage = c(10, 20, 50)) +
ggplot2::geom_point(data = AARP, alpha = 0.5,
aes(y = log(Cost), color = Sex))
statisticalModeling::fmodel(mod_2, ~ Age + Sex + Coverage,
Coverage = c(10, 20, 50)) +
ggplot2::geom_point(data = AARP, alpha = 0.5,
aes(y = log(Cost), color = Sex))
statisticalModeling::fmodel(mod_3, ~ Age + Sex + Coverage,
Coverage = c(10, 20, 50)) +
ggplot2::geom_point(data = AARP, alpha = 0.5,
aes(y = log(Cost), color = Sex))
statisticalModeling::fmodel(mod_4, ~ Age + Sex + Coverage,
Coverage = c(10, 20, 50)) +
ggplot2::geom_point(data = AARP, alpha = 0.5,
aes(y = log(Cost), color = Sex))
# Use cross validation to compare mod_4 and mod_1
results <- statisticalModeling::cv_pred_error(mod_1, mod_4)
boxplot(mse ~ model, data = results)
# Exponential growth: oil production is roughly linear on a log scale, so the
# year slope of log(mbbl) converts to a percent annual growth rate
data(Oil_history, package="statisticalModeling")
str(Oil_history)
## Classes 'tbl_df', 'tbl' and 'data.frame': 63 obs. of 2 variables:
## $ year: int 1880 1890 1900 1905 1910 1915 1920 1925 1930 1935 ...
## $ mbbl: num 30 77 149 215 328 ...
# Restrict to pre-1969 and add the log-transformed response
Oil_production <- Oil_history %>%
filter(year <= 1968) %>%
mutate(log_mbbl=log(mbbl))
str(Oil_production)
## Classes 'tbl_df', 'tbl' and 'data.frame': 19 obs. of 3 variables:
## $ year : int 1880 1890 1900 1905 1910 1915 1920 1925 1930 1935 ...
## $ mbbl : num 30 77 149 215 328 ...
## $ log_mbbl: num 3.4 4.34 5 5.37 5.79 ...
ggplot(Oil_production, aes(x=year, y=mbbl)) +
geom_point() +
geom_line()
# Model of oil production in mbbl
model_1 <- lm(mbbl ~ year, data = Oil_production)
# Plot model_1 with scatterplot of mbbl vs. year
statisticalModeling::fmodel(model_1, data = Oil_production) +
geom_point(data = Oil_production)
# Effect size of year (mbbl per year — a straight line fits the raw scale poorly)
statisticalModeling::effect_size(model_1, ~ year)
## slope year to:year
## 1 140.3847 1935 1962.324
# Model of log-transformed production
model_2 <- lm(log_mbbl ~ year, data = Oil_production)
# Plot model_2 with scatterplot of mbbl vs. year
statisticalModeling::fmodel(model_2, data = Oil_production) +
geom_point(data = Oil_production)
# And the effect size on log-transformed production
statisticalModeling::effect_size(model_2, ~ year)
## slope year to:year
## 1 0.06636971 1935 1962.324
# Annual growth: a log-slope b corresponds to 100*(exp(b) - 1) percent per year
100 * (exp(round(0.06637, 3)) - 1)
## [1] 6.822672
# Compare prediction error in DOLLARS for a raw-price model vs. a log-price
# model: back-transform the log predictions with exp() before computing MSE
data(Used_Fords, package="statisticalModeling")
# A model of price
model_1 <- lm(Price ~ Mileage + Age, data = Used_Fords)
# A model of logarithmically transformed price
Used_Fords$log_price <- log(Used_Fords$Price)
model_2 <- lm(log_price ~ Mileage + Age, data = Used_Fords)
# The model values on the original cases
preds_1 <- statisticalModeling::evaluate_model(model_1, data = Used_Fords)
# The model output for model_2 - giving log price
preds_2 <- statisticalModeling::evaluate_model(model_2, data = Used_Fords)
# Transform predicted log price to price
preds_2$model_price <- exp(preds_2$model_output)
# Mean square errors in price — the log model roughly halves the MSE
mean((preds_1$Price - preds_1$model_output)^2, na.rm = TRUE)
## [1] 6026231
mean((preds_2$Price - preds_2$model_price)^2, na.rm = TRUE)
## [1] 3711549
# Bootstrap confidence interval on the Age effect, expressed as a percent
# change in price per year of age
data(Used_Fords, package="statisticalModeling")
# A model of logarithmically transformed price
model <- lm(log(Price) ~ Mileage + Age, data = Used_Fords)
# Create the bootstrap replications
bootstrap_reps <- statisticalModeling::ensemble(model, nreps = 100, data = Used_Fords)
# Find the effect size
age_effect <- statisticalModeling::effect_size(bootstrap_reps, ~ Age)
# Change the slope to a percent change: 100*(exp(slope) - 1)
age_effect$percent_change <- 100 * (exp(age_effect$slope) - 1)
# Find confidence interval: mean +/- 2 standard errors (~95% coverage)
with(age_effect, mean(percent_change) + c(-2, 2) * sd(percent_change))
## [1] -9.218384 -7.342702
Chapter 4 - Variables Working Together
Confidence and collinearity - managing covariates appropriately to reflect mechanisms of the real-world:
Example code includes:
# Collinearity: educ, exper, and age are nearly linearly dependent
# (age ~ educ + exper + 6), which inflates the standard error of the educ
# effect. collinearity() reports a standard-error inflation factor (SeIF)
# for each explanatory term; dropping one of the offending covariates
# restores precision.
data(CPS85, package="mosaicData")
# A model of wage
model_1 <- lm(wage ~ educ + sector + exper + age, data = CPS85)
# Effect size of educ on wage
statisticalModeling::effect_size(model_1, ~ educ)
## slope educ to:educ sector exper age
## 1 0.5732615 12 14.61537 prof 15 35
# Examine confidence interval on effect size — wide, due to collinearity
ensemble_1 <- statisticalModeling::ensemble(model_1, nreps = 100)
effect_from_1 <- suppressWarnings(statisticalModeling::effect_size(ensemble_1, ~ educ))
with(effect_from_1, mean(slope) + c(-2, 2) * sd(slope))
## [1] 0.2637031 1.0105522
# Collinearity inflation factor on standard error
statisticalModeling::collinearity( ~ educ + sector + exper + age, data = CPS85)
## expl_vars SeIF
## 1 educ 15.273900
## 2 sectorconst 1.090245
## 3 sectormanag 1.215769
## 4 sectormanuf 1.252303
## 5 sectorother 1.239831
## 6 sectorprof 1.405901
## 7 sectorsales 1.137992
## 8 sectorservice 1.274175
## 9 exper 71.980564
## 10 age 68.116772
# Leave out covariates one at a time
statisticalModeling::collinearity( ~ educ + sector + exper, data = CPS85) # leave out age
## expl_vars SeIF
## 1 educ 1.380220
## 2 sectorconst 1.090245
## 3 sectormanag 1.215761
## 4 sectormanuf 1.252303
## 5 sectorother 1.239814
## 6 sectorprof 1.402902
## 7 sectorsales 1.137990
## 8 sectorservice 1.274174
## 9 exper 1.092803
statisticalModeling::collinearity( ~ educ + sector + age, data = CPS85) # leave out exper
## expl_vars SeIF
## 1 educ 1.311022
## 2 sectorconst 1.090245
## 3 sectormanag 1.215754
## 4 sectormanuf 1.252302
## 5 sectorother 1.239801
## 6 sectorprof 1.402764
## 7 sectorsales 1.137990
## 8 sectorservice 1.274174
## 9 age 1.034143
statisticalModeling::collinearity( ~ educ + exper + age, data = CPS85) # leave out sector
## expl_vars SeIF
## 1 educ 15.15169
## 2 exper 71.74900
## 3 age 67.90730
# Improved model leaving out worst offending covariate (exper)
model_2 <- lm(wage ~ educ + sector + age, data = CPS85)
# Confidence interval of effect size of educ on wage — much narrower now
ensemble_2 <- statisticalModeling::ensemble(model_2, nreps = 100)
effect_from_2 <- statisticalModeling::effect_size(ensemble_2, ~ educ)
with(effect_from_2, mean(slope) + c(-2, 2) * sd(slope))
## [1] 0.4726406 0.8194584
# Trade-off: the interaction model predicts better (lower cross-validated MSE)
# but the interaction term raises the SeIF, widening the CI on the Age effect
data(Used_Fords, package="statisticalModeling")
# Train a model Price ~ Age + Mileage
model_1 <- lm(Price ~ Age + Mileage, data = Used_Fords)
# Train a similar model including the interaction
model_2 <- lm(Price ~ Age * Mileage, data = Used_Fords)
# Compare cross-validated prediction error
statisticalModeling::cv_pred_error(model_1, model_2)
## mse model
## 1 6159401 model_1
## 2 6106128 model_1
## 3 6093206 model_1
## 4 6095334 model_1
## 5 6082699 model_1
## 6 3633939 model_2
## 7 3661556 model_2
## 8 3654686 model_2
## 9 3641057 model_2
## 10 3673792 model_2
# Use bootstrapping to find conf. interval on effect size of Age
ensemble_1 <- statisticalModeling::ensemble(model_1, nreps = 100)
ensemble_2 <- statisticalModeling::ensemble(model_2, nreps = 100)
effect_from_1 <- statisticalModeling::effect_size(ensemble_1, ~ Age)
effect_from_2 <- statisticalModeling::effect_size(ensemble_2, ~ Age)
with(effect_from_1, mean(slope) + c(-2, 2) * sd(slope))
## [1] -662.1550 -497.6593
with(effect_from_2, mean(slope) + c(-2, 2) * sd(slope))
## [1] -958.2758 -796.4079
# Compare inflation for the model with and without interaction
statisticalModeling::collinearity(~ Age + Mileage, data = Used_Fords)
## expl_vars SeIF
## 1 Age 1.5899
## 2 Mileage 1.5899
statisticalModeling::collinearity(~ Age * Mileage, data = Used_Fords)
## expl_vars SeIF
## 1 Age 2.510430
## 2 Mileage 2.147278
## 3 Age:Mileage 3.349224
Chapter 1 - Exploratory Time Series Data Analysis
Time series is a sequence of data in chronological order (recorded sequentially over time), especially common in finance and economics:
Sampling frequency - some time series data is evenly spaced, other time series data is only approximately evenly spaced:
Basic time series objects - start with a vector of numbers, add an index using the ts() or other functions:
Example code includes:
# Basic ts-object inspection with the built-in Nile annual flow series
data(Nile, package="datasets")
# Print the Nile dataset — a ts object prints with its Start/End/Frequency
print(Nile)
## Time Series:
## Start = 1871
## End = 1970
## Frequency = 1
## [1] 1120 1160 963 1210 1160 1160 813 1230 1370 1140 995 935 1110 994
## [15] 1020 960 1180 799 958 1140 1100 1210 1150 1250 1260 1220 1030 1100
## [29] 774 840 874 694 940 833 701 916 692 1020 1050 969 831 726
## [43] 456 824 702 1120 1100 832 764 821 768 845 864 862 698 845
## [57] 744 796 1040 759 781 865 845 944 984 897 822 1010 771 676
## [71] 649 846 812 742 801 1040 860 874 848 890 744 749 838 1050
## [85] 918 986 797 923 975 815 1020 906 901 1170 912 746 919 718
## [99] 714 740
# List the number of observations in the Nile dataset
length(Nile)
## [1] 100
# Display the first 10 elements of the Nile dataset
head(Nile, n=10)
## [1] 1120 1160 963 1210 1160 1160 813 1230 1370 1140
# Display the last 12 elements of the Nile dataset
tail(Nile, n=12)
## [1] 975 815 1020 906 901 1170 912 746 919 718 714 740
# Plot the Nile data — plot() on a ts object draws a time-series line plot
plot(Nile)
# Plot the Nile data with xlab and ylab arguments
plot(Nile, xlab = "Year", ylab = "River Volume (1e9 m^{3})")
# Plot the Nile data with xlab, ylab, main, and type arguments
# (type = "b" shows both points and connecting lines)
plot(Nile, xlab = "Year", ylab = "River Volume (1e9 m^{3})",
main="Annual River Nile Volume at Aswan, 1871-1970", type="b"
)
# Sampling frequency demo: the same values plotted against their true
# (unevenly spaced) times vs. a naive 1:20 discrete index
continuous_series <- c( 0.5689 , 0.7663 , 0.9921 , 0.9748 , 0.3991 , 0.3766 , -0.3853 , -0.8364 , -0.9997 , -0.9983 , -0.6462 , -0.0939 , 0.4005 , 0.6816 , 0.9532 , 0.9969 , 0.8393 , 0.37 , -0.2551 , -0.6174 )
continuous_time_index <- c( 1.2103 , 1.7461 , 2.8896 , 3.5914 , 5.4621 , 5.5109 , 7.0743 , 8.2644 , 9.3734 , 9.5411 , 11.1611 , 12.3784 , 13.3906 , 14.0663 , 15.0935 , 15.8645 , 16.8574 , 18.0915 , 19.3655 , 20.1805 )
# Plot the continuous_series using continuous time indexing
# (par(mfrow=c(2,1)) stacks the two plots vertically)
par(mfrow=c(2,1))
plot(continuous_time_index, continuous_series, type = "b")
# Make a discrete time index using 1:20
discrete_time_index <- 1:20
# Now plot the continuous_series using discrete time indexing
plot(discrete_time_index, continuous_series, type = "b")
# Restore the single-panel plotting layout
par(mfrow=c(1, 1))
# ts accessors (time, deltat, frequency, cycle) and a mean-imputation demo
# on the monthly AirPassengers series
data(AirPassengers, package="datasets")
str(AirPassengers)
## Time-Series [1:144] from 1949 to 1961: 112 118 132 129 121 135 148 148 136 119 ...
# Plot AirPassengers
plot(AirPassengers)
# View the start and end dates of AirPassengers
start(AirPassengers)
## [1] 1949 1
end(AirPassengers)
## [1] 1960 12
# Use time(), deltat(), frequency(), and cycle() with AirPassengers
time(AirPassengers)
## Jan Feb Mar Apr May Jun Jul
## 1949 1949.000 1949.083 1949.167 1949.250 1949.333 1949.417 1949.500
## 1950 1950.000 1950.083 1950.167 1950.250 1950.333 1950.417 1950.500
## 1951 1951.000 1951.083 1951.167 1951.250 1951.333 1951.417 1951.500
## 1952 1952.000 1952.083 1952.167 1952.250 1952.333 1952.417 1952.500
## 1953 1953.000 1953.083 1953.167 1953.250 1953.333 1953.417 1953.500
## 1954 1954.000 1954.083 1954.167 1954.250 1954.333 1954.417 1954.500
## 1955 1955.000 1955.083 1955.167 1955.250 1955.333 1955.417 1955.500
## 1956 1956.000 1956.083 1956.167 1956.250 1956.333 1956.417 1956.500
## 1957 1957.000 1957.083 1957.167 1957.250 1957.333 1957.417 1957.500
## 1958 1958.000 1958.083 1958.167 1958.250 1958.333 1958.417 1958.500
## 1959 1959.000 1959.083 1959.167 1959.250 1959.333 1959.417 1959.500
## 1960 1960.000 1960.083 1960.167 1960.250 1960.333 1960.417 1960.500
## Aug Sep Oct Nov Dec
## 1949 1949.583 1949.667 1949.750 1949.833 1949.917
## 1950 1950.583 1950.667 1950.750 1950.833 1950.917
## 1951 1951.583 1951.667 1951.750 1951.833 1951.917
## 1952 1952.583 1952.667 1952.750 1952.833 1952.917
## 1953 1953.583 1953.667 1953.750 1953.833 1953.917
## 1954 1954.583 1954.667 1954.750 1954.833 1954.917
## 1955 1955.583 1955.667 1955.750 1955.833 1955.917
## 1956 1956.583 1956.667 1956.750 1956.833 1956.917
## 1957 1957.583 1957.667 1957.750 1957.833 1957.917
## 1958 1958.583 1958.667 1958.750 1958.833 1958.917
## 1959 1959.583 1959.667 1959.750 1959.833 1959.917
## 1960 1960.583 1960.667 1960.750 1960.833 1960.917
# deltat(): time step between observations (1/12 year for monthly data)
deltat(AirPassengers)
## [1] 0.08333333
frequency(AirPassengers)
## [1] 12
# cycle(): position of each observation within the seasonal cycle (month number)
cycle(AirPassengers)
## Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 1949 1 2 3 4 5 6 7 8 9 10 11 12
## 1950 1 2 3 4 5 6 7 8 9 10 11 12
## 1951 1 2 3 4 5 6 7 8 9 10 11 12
## 1952 1 2 3 4 5 6 7 8 9 10 11 12
## 1953 1 2 3 4 5 6 7 8 9 10 11 12
## 1954 1 2 3 4 5 6 7 8 9 10 11 12
## 1955 1 2 3 4 5 6 7 8 9 10 11 12
## 1956 1 2 3 4 5 6 7 8 9 10 11 12
## 1957 1 2 3 4 5 6 7 8 9 10 11 12
## 1958 1 2 3 4 5 6 7 8 9 10 11 12
## 1959 1 2 3 4 5 6 7 8 9 10 11 12
## 1960 1 2 3 4 5 6 7 8 9 10 11 12
# Plot the AirPassengers data
plot(AirPassengers)
# Compute the mean of AirPassengers
mean(AirPassengers, na.rm=TRUE)
## [1] 280.2986
# Impute mean values to NA in AirPassengers
# NOTE(review): in the DataCamp exercise entries 85:96 were NA; with the
# stock datasets::AirPassengers this overwrites real 1956 observations.
# The assignment creates a modified copy in the global environment.
AirPassengers[85:96] <- mean(AirPassengers, na.rm = TRUE)
# Generate another plot of AirPassengers (the imputed copy)
plot(AirPassengers)
# Add the complete AirPassengers data to your plot
# rm() deletes the modified global copy, so the points() call below falls
# back to the original datasets::AirPassengers and overlays the true values
rm(AirPassengers)
points(AirPassengers, type = "l", col = 2, lty = 3)
# Build a quarterly ts object from a plain numeric vector with ts()
data_vector <- c( 2.0522 , 4.2929 , 3.3294 , 3.5086 , 0.001 , 1.9217 , 0.7978 , 0.3 , 0.9436 , 0.5748 , -0.0034 , 0.3449 , 2.223 , 0.1763 , 2.7098 , 1.2502 , -0.4007 , 0.8853 , -1.5852 , -2.2829 , -2.561 , -3.126 , -2.866 , -1.7847 , -1.8895 , -2.7255 , -2.1033 , -0.0174 , -0.3613 , -2.9008 , -3.2847 , -2.8685 , -1.9505 , -4.8802 , -3.2635 , -1.6396 , -3.3013 , -2.6331 , -1.7058 , -2.212 , -0.5171 , 0.0753 , -0.8407 , -1.4023 , -0.1382 , -1.4066 , -2.3047 , 1.5074 , 0.7119 , -1.1301 )
# Use print() and plot() to view data_vector
print(data_vector)
## [1] 2.0522 4.2929 3.3294 3.5086 0.0010 1.9217 0.7978 0.3000
## [9] 0.9436 0.5748 -0.0034 0.3449 2.2230 0.1763 2.7098 1.2502
## [17] -0.4007 0.8853 -1.5852 -2.2829 -2.5610 -3.1260 -2.8660 -1.7847
## [25] -1.8895 -2.7255 -2.1033 -0.0174 -0.3613 -2.9008 -3.2847 -2.8685
## [33] -1.9505 -4.8802 -3.2635 -1.6396 -3.3013 -2.6331 -1.7058 -2.2120
## [41] -0.5171 0.0753 -0.8407 -1.4023 -0.1382 -1.4066 -2.3047 1.5074
## [49] 0.7119 -1.1301
plot(data_vector)
# Convert data_vector to a ts object with start = 2004 and frequency = 4
# (frequency = 4 means quarterly observations)
time_series <- ts(data_vector, start=2004, frequency=4)
# Use print() and plot() to view time_series — note the Qtr1-Qtr4 layout
print(time_series)
## Qtr1 Qtr2 Qtr3 Qtr4
## 2004 2.0522 4.2929 3.3294 3.5086
## 2005 0.0010 1.9217 0.7978 0.3000
## 2006 0.9436 0.5748 -0.0034 0.3449
## 2007 2.2230 0.1763 2.7098 1.2502
## 2008 -0.4007 0.8853 -1.5852 -2.2829
## 2009 -2.5610 -3.1260 -2.8660 -1.7847
## 2010 -1.8895 -2.7255 -2.1033 -0.0174
## 2011 -0.3613 -2.9008 -3.2847 -2.8685
## 2012 -1.9505 -4.8802 -3.2635 -1.6396
## 2013 -3.3013 -2.6331 -1.7058 -2.2120
## 2014 -0.5171 0.0753 -0.8407 -1.4023
## 2015 -0.1382 -1.4066 -2.3047 1.5074
## 2016 0.7119 -1.1301
plot(time_series)
# Check whether data_vector and time_series are ts objects
is.ts(data_vector)
## [1] FALSE
is.ts(time_series)
## [1] TRUE
# Check whether Nile is a ts object
is.ts(Nile)
## [1] TRUE
# Check whether AirPassengers is a ts object
is.ts(AirPassengers)
## [1] TRUE
# DO NOT HAVE eu_stocks - the course's EuStockMarkets-style data appears to be 1860x4 for 1991/130-1998/169, frequency 260, using DAX, SMI, CAC, FTSE
# The numDAX/numSMI/numCAC vectors below are a smaller hand-built mock-up standing in for eu_stocks
numDAX <- c( 1628.8, 1613.6, 1606.5, 1621, 1618.2, 1610.6, 1630.8, 1640.2, 1635.5, 1645.9, 1647.8, 1638.3, 1629.9, 1621.5, 1624.7, 1627.6, 1632, 1621.2, 1613.4, 1605, 1605.8, 1616.7, 1619.3, 1620.5, 1619.7, 1623.1, 1614, 1631.9, 1630.4, 1633.5, 1626.5, 1650.4, 1650.1, 1654.1, 1653.6, 1501.8, 1524.3, 1603.7, 1622.5, 1636.7, 1652.1, 1645.8, 1650.4, 1651.5, 1649.9, 1653.5, 1657.5, 1649.5, 1649.1, 1646.4, 1638.7, 1625.8, 1628.6, 1632.2, 1633.7, 1631.2, 1635.8, 1621.3, 1624.7, 1616.1, 1618.1, 1627.8, 1625.8, 1614.8, 1612.8, 1605.5, 1609.3, 1607.5, 1607.5, 1604.9, 1589.1, 1582.3, 1568, 1568.2, 1569.7, 1571.7, 1585.4, 1570, 1561.9, 1565.2, 1570.3, 1577, 1590.3, 1572.7, 1572.1, 1579.2, 1588.7, 1586, 1579.8, 1572.6, 1568.1, 1578.2, 1573.9, 1582.1, 1610.2, 1605.2, 1623.8, 1615.3, 1627.1, 1627, 1605.7, 1589.7, 1589.7, 1603.3, 1599.8, 1590.9, 1603.5, 1589.9, 1587.9, 1571.1, 1549.8, 1549.4, 1554.7, 1557.5, 1555.3, 1559.8, 1548.4, 1544, 1550.2, 1557, 1551.8, 1562.9, 1570.3, 1559.3, 1545.9, 1542.8, 1542.8, 1542.8, 1542.8, 1564.3, 1577.3, 1577.3, 1577.3, 1598.2, 1604, 1604.7, 1593.7, 1581.7, 1599.1, 1613.8, 1620.5, 1629.5, 1663.7, 1664.1, 1669.3, 1685.1, 1687.1, 1680.1, 1671.8, 1669.5, 1686.7, 1685.5, 1671, 1683.1, 1685.7, 1685.7, 1678.8, 1685.8, 1683.7, 1686.6, 1683.7, 1679.1, 1685, 1680.8, 1676.2, 1688.5, 1696.5, 1690.2, 1711.3, 1711.3, 1729.9, 1716.6, 1743.4, 1745.2, 1746.8, 1749.3, 1763.9, 1762.3, 1762.3, 1746.8, 1753.5, 1753.2, 1739.9, 1723.9, 1734.4, 1723.1, 1732.9, 1729.9, 1725.7, 1730.9, 1714.2, 1716.2, 1719.1, 1718.2, 1698.8, 1714.8, 1718.3, 1706.7, 1723.4, 1716.2, 1738.8, 1737.4, 1714.8, 1724.2, 1733.8, 1730, 1734.5, 1744.3, 1746.9, 1746.9, 1746.9, 1747.5, 1753.1, 1745.2, 1745.7, 1742.9, 1731.7, 1731.2, 1728.1, 1728.1, 1731.3, 1733.8, 1745.8, 1752.6, 1748.1, 1750.7, 1747.9, 1745.8, 1735.3, 1719.9, 1763.6, 1766.8, 1785.4, 1783.6, 1804.4, 1812.3, 1799.5, 1792.8, 1792.8, 1806.4, 1798.2, 1800.6, 1786.2, 1791.3, 1789, 1789, 1784.7, 1789.5, 1779.7, 1787, 1773.2, 1781.6, 
1773.8, 1773.8, 1776.3, 1770.7, 1772.4, 1762.5, 1764.3, 1752.8, 1756, 1755, 1759.9, 1759.8, 1776.5, 1770, 1767, 1752.3, 1760.2, 1750.3, 1731.4, 1735.5, 1733.8, 1730.8, 1699.5, 1652.7, 1654.1, 1636.8, 1622.8, 1613.4, 1617.8, 1617.2, 1637.6, 1622.2, 1608.5, 1605.1, 1609.6, 1624.9, 1618.1, 1612, 1579, 1561.4, 1547.9, 1548.6, 1560.2, 1554.8, 1531.9, 1526.1, 1509, 1530, 1485, 1464, 1475.1, 1516.1, 1519.7, 1530, 1516.4, 1515.5, 1543.9, 1534.7, 1538.7, 1536.7, 1523.8, 1527.1, 1530.2, 1601.5, 1580.3, 1595.1, 1579.5, 1600.6, 1566, 1557, 1542.7, 1536.3, 1510.7, 1481, 1483.8, 1470.1, 1484.8, 1475.4, 1402.3, 1421.5, 1434.6, 1446.3, 1437.7, 1441.6, 1471.6, 1454, 1453.8, 1458, 1479.6, 1504.9, 1496.5, 1511, 1528.9, 1534, 1536.6, 1508.2, 1493.5, 1489.7, 1482.4, 1483.3, 1470.6, 1484.8, 1487.7, 1508.6, 1515.3, 1509.8, 1542.3, 1541.8, 1542.5, 1550.3, 1550.3, 1543.4, 1547.8, 1523.6, 1526.7, 1513.4, 1523, 1529.7, 1545.1, 1546.8, 1528.1, 1530.7, 1526.2, 1519.5, 1506.7, 1504.3, 1480.7, 1476.7, 1478.1, 1479.6, 1477.5, 1472.6, 1495.6, 1517.5, 1520.9, 1527.1, 1527.1, 1527.1, 1547.5, 1545.8, 1538.4, 1538.4, 1538.4, 1538, 1554, 1551.2, 1538.4, 1529.1 )
numSMI <- c( 1678.1, 1688.5, 1678.6, 1684.1, 1686.6, 1671.6, 1682.9, 1703.6, 1697.5, 1716.3, 1723.8, 1730.5, 1727.4, 1733.3, 1734, 1728.3, 1737.1, 1723.1, 1723.6, 1719, 1721.2, 1725.3, 1727.2, 1727.2, 1731.6, 1724.1, 1716.9, 1723.4, 1723, 1728.4, 1722.1, 1724.5, 1733.6, 1739, 1726.2, 1587.4, 1630.6, 1685.5, 1701.3, 1718, 1726.2, 1716.6, 1725.8, 1737.4, 1736.6, 1732.4, 1731.2, 1726.9, 1727.8, 1720.2, 1715.4, 1708.7, 1713, 1713.5, 1718, 1701.7, 1701.7, 1684.9, 1687.2, 1690.6, 1684.3, 1679.9, 1672.9, 1663.1, 1669.3, 1664.7, 1672.3, 1687.7, 1686.8, 1686.6, 1675.8, 1677.4, 1673.2, 1665, 1671.3, 1672.4, 1676.2, 1692.6, 1696.5, 1716.1, 1713.3, 1705.1, 1711.3, 1709.8, 1688.6, 1698.9, 1700, 1693, 1683.9, 1679.2, 1673.9, 1683.9, 1688.4, 1693.9, 1720.9, 1717.9, 1733.6, 1729.7, 1735.6, 1734.1, 1699.3, 1678.6, 1675.5, 1670.1, 1652.2, 1635, 1654.9, 1642, 1638.7, 1622.6, 1596.1, 1612.4, 1625, 1610.5, 1606.6, 1610.7, 1603.1, 1591.5, 1605.2, 1621.4, 1622.5, 1626.6, 1627.4, 1614.9, 1602.3, 1598.3, 1627, 1627, 1627, 1655.7, 1670.1, 1670.1, 1670.1, 1670.1, 1704, 1711.8, 1700.5, 1690.3, 1715.4, 1723.5, 1719.4, 1734.4, 1772.8, 1760.3, 1747.2, 1750.2, 1755.3, 1754.6, 1751.2, 1752.5, 1769.4, 1767.6, 1750, 1747.1, 1753.5, 1752.8, 1752.9, 1764.7, 1776.8, 1779.3, 1785.1, 1798.2, 1794.1, 1795.2, 1780.4, 1789.5, 1794.2, 1784.4, 1800.1, 1804, 1816.2, 1810.5, 1821.9, 1828.2, 1840.6, 1841.1, 1846.3, 1850, 1839, 1820.2, 1815.2, 1820.6, 1807.1, 1791.4, 1806.2, 1798.7, 1818.2, 1820.5, 1833.3, 1837.1, 1818.2, 1824.1, 1830.1, 1835.6, 1828.7, 1839.2, 1837.2, 1826.7, 1838, 1829.1, 1843.1, 1850.5, 1827.1, 1829.1, 1848, 1840.5, 1853.8, 1874.1, 1871.3, 1871.3, 1871.3, 1860.5, 1874.7, 1880.1, 1874.7, 1875.6, 1859.5, 1874.2, 1880.1, 1880.1, 1907.7, 1920.5, 1937.3, 1936.8, 1949.1, 1963.7, 1950.8, 1953.5, 1945, 1921.1, 1939.1, 1928, 1933.4, 1925.7, 1931.7, 1928.7, 1924.5, 1914.2, 1914.2, 1920.6, 1923.3, 1930.4, 1915.2, 1916.9, 1913.8, 1913.8, 1899.7, 1888, 1868.8, 1879.9, 1865.7, 1881.3, 1873.1, 1862.5, 
1869.3, 1846.9, 1847.1, 1838.3, 1845.8, 1835.5, 1846.6, 1854.8, 1845.3, 1854.5, 1870.5, 1862.6, 1856.6, 1837.6, 1846.7, 1856.5, 1841.8, 1835, 1844.4, 1838.9, 1805.6, 1756.6, 1786.1, 1757.1, 1762.8, 1756.8, 1761.9, 1778.5, 1812.7, 1806.1, 1798.1, 1794.9, 1805.4, 1820.3, 1819.6, 1809.6, 1799.9, 1800.3, 1793.3, 1784.8, 1791.7, 1800.2, 1788.6, 1775.7, 1753.5, 1768.2, 1727.9, 1709.6, 1704.6, 1740.6, 1745.7, 1751.7, 1747.3, 1757.8, 1774.2, 1774.4, 1788.3, 1788, 1779.1, 1792.8, 1812, 1872.1, 1851.4, 1873.4, 1889.6, 1897.5, 1888.8, 1900.4, 1913.4, 1909.9, 1910.8, 1879.2, 1880.2, 1878.3, 1885.2, 1867.6, 1788, 1820.5, 1858.2, 1870.3, 1878.4, 1881.5, 1893.2, 1889.3, 1877.3, 1884, 1904.7, 1922.7, 1908.5, 1911.4, 1921.1, 1930.8, 1927.8, 1908.3, 1905.9, 1911.1, 1921.6, 1933.6, 1942, 1951.5, 1955.7, 1957.4, 1962.3, 1946.1, 1950.2, 1929.7, 1913.4, 1889.5, 1882.8, 1895.4, 1897.9, 1891.5, 1880.1, 1887, 1891.4, 1914.6, 1931.2, 1929.2, 1924.3, 1927, 1935, 1955.4, 1962.2, 1980.7, 1987.7, 1993.7, 2015.7, 2005, 2023.9, 2028.5, 2044.9, 2045.8, 2057.3, 2061.7, 2061.7, 2061.7, 2092.3, 2090.1, 2105.4, 2105.4, 2105.4, 2117.7, 2128.2, 2124.7, 2079.9, 2074.9 )
numCAC <- c( 1772.8, 1750.5, 1718, 1708.1, 1723.1, 1714.3, 1734.5, 1757.4, 1754, 1754.3, 1759.8, 1755.5, 1758.1, 1757.5, 1763.5, 1762.8, 1768.9, 1778.1, 1780.1, 1767.7, 1757.9, 1756.6, 1754.7, 1766.8, 1766.5, 1762.2, 1759.5, 1782.4, 1789.5, 1783.5, 1780.4, 1808.8, 1820.3, 1820.3, 1820.3, 1687.5, 1725.6, 1792.9, 1819.1, 1833.5, 1853.4, 1849.7, 1851.8, 1857.7, 1864.3, 1863.5, 1873.2, 1860.8, 1868.7, 1860.4, 1855.9, 1840.5, 1842.6, 1861.2, 1876.2, 1878.3, 1878.4, 1869.4, 1880.4, 1885.5, 1888.4, 1885.2, 1877.9, 1876.5, 1883.8, 1880.6, 1887.4, 1878.3, 1867.1, 1851.9, 1843.6, 1848.1, 1843.4, 1843.6, 1833.8, 1833.4, 1856.9, 1863.4, 1855.5, 1864.2, 1846, 1836.8, 1830.4, 1831.6, 1834.8, 1852.1, 1849.8, 1861.8, 1856.7, 1856.7, 1841.5, 1846.9, 1836.1, 1838.6, 1857.6, 1857.6, 1858.4, 1846.8, 1868.5, 1863.2, 1808.3, 1765.1, 1763.5, 1766, 1741.3, 1743.3, 1769, 1757.9, 1754.9, 1739.7, 1708.8, 1722.2, 1713.9, 1703.2, 1685.7, 1663.4, 1636.9, 1645.6, 1671.6, 1688.3, 1696.8, 1711.7, 1706.2, 1684.2, 1648.5, 1633.6, 1699.1, 1699.1, 1722.5, 1720.7, 1741.9, 1765.7, 1765.7, 1749.9, 1770.3, 1787.6, 1778.7, 1785.6, 1833.9, 1837.4, 1824.3, 1843.8, 1873.6, 1860.2, 1860.2, 1865.9, 1867.9, 1841.3, 1838.7, 1849.9, 1869.3, 1890.6, 1879.6, 1873.9, 1875.3, 1857, 1856.5, 1865.8, 1860.6, 1861.6, 1865.6, 1864.1, 1861.6, 1876.5, 1865.1, 1882.1, 1912.2, 1915.4, 1951.2, 1962.4, 1976.5, 1953.5, 1981.3, 1985.1, 1983.4, 1979.7, 1983.8, 1988.1, 1973, 1966.9, 1976.3, 1993.9, 1968, 1941.8, 1947.1, 1929.2, 1943.6, 1928.2, 1922, 1919.1, 1884.6, 1896.3, 1928.3, 1934.8, 1923.5, 1943.8, 1942.4, 1928.1, 1942, 1942.7, 1974.8, 1975.4, 1907.5, 1943.6, 1974.1, 1963.3, 1972.3, 1990.7, 1978.2, 1978.2, 1978.2, 1980.4, 1983.7, 1978.1, 1984.9, 1995.7, 2006.6, 2036.7, 2031.1, 2031.1, 2041.6, 2046.9, 2047.2, 2063.4, 2063.4, 2077.5, 2063.6, 2053.2, 2017, 2024, 2051.6, 2023.1, 2030.8, 2016.8, 2045.1, 2046.3, 2029.6, 2014.1, 2014.1, 2033.3, 2017.4, 2024.9, 1992.6, 1994.9, 1981.6, 1981.6, 1962.2, 1953.7, 1928.8, 1928.3, 1918.1, 
1931.4, 1908.8, 1891.8, 1913.9, 1885.8, 1895.8, 1899.6, 1920.3, 1915.3, 1907.3, 1900.6, 1880.9, 1873.5, 1883.6, 1868.5, 1879.1, 1847.8, 1861.8, 1859.4, 1859.4, 1859.4, 1853.3, 1851.2, 1801.8, 1767.9, 1762.7, 1727.5, 1734.6, 1734.6, 1755.4, 1769, 1801.6, 1782.6, 1754.7, 1784.4, 1787.6, 1798, 1793.8, 1777.3, 1755.2, 1737.8, 1730.1, 1722.4, 1753.5, 1757.3, 1736.7, 1734.2, 1724.2, 1744.2, 1689.7, 1667.7, 1667.8, 1687.6, 1687.5, 1684.9, 1674.2, 1711.4, 1780.5, 1779, 1779.3, 1763.7, 1756.8, 1774.2, 1802, 1873.6, 1836.2, 1859.8, 1852.7, 1882.9, 1826.1, 1832.8, 1828.9, 1829.5, 1843.5, 1770.3, 1731.9, 1736.7, 1724, 1683.3, 1611, 1612.5, 1654.2, 1673.9, 1657.3, 1655.1, 1685.1, 1667.9, 1650, 1664.2, 1679.1, 1731.3, 1722.2, 1730.7, 1766.4, 1770.7, 1774.5, 1749.9, 1730.9, 1742.4, 1742.4, 1786.9, 1804.1, 1804.7, 1793.6, 1786.7, 1798.5, 1798.5, 1821.5, 1796.8, 1772.7, 1764.4, 1759.2, 1722.3, 1724.2, 1674.8, 1720.6, 1721, 1739.7, 1749.7, 1771.4, 1792.3, 1783.3, 1799.4, 1781.7, 1788.6, 1765.9, 1791.2, 1769.5, 1758.7, 1738.3, 1744.8, 1736.7, 1735.2, 1760.1, 1786.3, 1824.4, 1821.1, 1854.6, 1854.6, 1857.5, 1870.3, 1858.8, 1857.8, 1857.8, 1843.1, 1850.8, 1859.6, 1844.5, 1852.6 )
numFTSE <- c( 2443.6, 2460.2, 2448.2, 2470.4, 2484.7, 2466.8, 2487.9, 2508.4, 2510.5, 2497.4, 2532.5, 2556.8, 2561, 2547.3, 2541.5, 2558.5, 2587.9, 2580.5, 2579.6, 2589.3, 2595, 2595.6, 2588.8, 2591.7, 2601.7, 2585.4, 2573.3, 2597.4, 2600.6, 2570.6, 2569.4, 2584.9, 2608.8, 2617.2, 2621, 2540.5, 2554.5, 2601.9, 2623, 2640.7, 2640.7, 2619.8, 2624.2, 2638.2, 2645.7, 2679.6, 2669, 2664.6, 2663.3, 2667.4, 2653.2, 2630.8, 2626.6, 2641.9, 2625.8, 2606, 2594.4, 2583.6, 2588.7, 2600.3, 2579.5, 2576.6, 2597.8, 2595.6, 2599, 2621.7, 2645.6, 2644.2, 2625.6, 2624.6, 2596.2, 2599.5, 2584.1, 2570.8, 2555, 2574.5, 2576.7, 2579, 2588.7, 2601.1, 2575.7, 2559.5, 2561.1, 2528.3, 2514.7, 2558.5, 2553.3, 2577.1, 2566, 2549.5, 2527.8, 2540.9, 2534.2, 2538, 2559, 2554.9, 2575.5, 2546.5, 2561.6, 2546.6, 2502.9, 2463.1, 2472.6, 2463.5, 2446.3, 2456.2, 2471.5, 2447.5, 2428.6, 2420.2, 2414.9, 2420.2, 2423.8, 2407, 2388.7, 2409.6, 2392, 2380.2, 2423.3, 2451.6, 2440.8, 2432.9, 2413.6, 2391.6, 2358.1, 2345.4, 2384.4, 2384.4, 2384.4, 2418.7, 2420, 2493.1, 2493.1, 2492.8, 2504.1, 2493.2, 2482.9, 2467.1, 2497.9, 2477.9, 2490.1, 2516.3, 2537.1, 2541.6, 2536.7, 2544.9, 2543.4, 2522, 2525.3, 2510.4, 2539.9, 2552, 2546.5, 2550.8, 2571.2, 2560.2, 2556.8, 2547.1, 2534.3, 2517.2, 2538.4, 2537.1, 2523.7, 2522.6, 2513.9, 2541, 2555.9, 2536.7, 2543.4, 2542.3, 2559.7, 2546.8, 2565, 2562, 2562.1, 2554.3, 2565.4, 2558.4, 2538.3, 2533.1, 2550.7, 2574.8, 2522.4, 2493.3, 2476, 2470.7, 2491.2, 2464.7, 2467.6, 2456.6, 2441, 2458.7, 2464.9, 2472.2, 2447.9, 2452.9, 2440.1, 2408.6, 2405.4, 2382.7, 2400.9, 2404.2, 2393.2, 2436.4, 2572.6, 2591, 2600.5, 2640.2, 2638.6, 2638.6, 2638.6, 2625.8, 2607.8, 2609.8, 2643, 2658.2, 2651, 2664.9, 2654.1, 2659.8, 2659.8, 2662.2, 2698.7, 2701.9, 2725.7, 2737.8, 2722.4, 2720.5, 2694.7, 2682.6, 2703.6, 2700.6, 2711.9, 2702, 2715, 2715, 2704.6, 2698.6, 2694.2, 2707.6, 2697.6, 2705.9, 2680.9, 2681.9, 2668.5, 2645.8, 2635.4, 2636.1, 2614.1, 2603.7, 2593.6, 2616.3, 2598.4, 2562.7, 2584.8, 
2550.3, 2560.6, 2532.6, 2557.3, 2534.1, 2515.8, 2521.2, 2493.9, 2476.1, 2497.1, 2469, 2493.7, 2472.6, 2497.9, 2490.8, 2478.3, 2484, 2486.4, 2483.4, 2431.9, 2403.7, 2415.6, 2387.9, 2399.5, 2377.2, 2348, 2373.4, 2423.2, 2411.6, 2399.6, 2420.2, 2407.5, 2392.8, 2377.6, 2350.1, 2325.7, 2309.6, 2303.1, 2318, 2356.8, 2376.1, 2354.7, 2363.5, 2359.4, 2365.7, 2311.1, 2281, 2285, 2311.6, 2312.6, 2312.6, 2298.4, 2313, 2381.9, 2362.2, 2372.2, 2337.7, 2327.5, 2340.6, 2370.9, 2422.1, 2370, 2378.3, 2483.9, 2567, 2560.1, 2586, 2580.5, 2621.2, 2601, 2560, 2565.5, 2553, 2572.3, 2549.7, 2446.3, 2488.4, 2517.1, 2538.8, 2541.2, 2557.2, 2584.7, 2574.7, 2546.6, 2563.9, 2562.2, 2617, 2645.7, 2658.1, 2669.7, 2661.6, 2669.8, 2650.4, 2642.3, 2658.3, 2687.8, 2705.6, 2691.7, 2711.1, 2702.7, 2695.4, 2714.6, 2696.8, 2726.4, 2697.5, 2679.6, 2679.2, 2704, 2706.2, 2732.4, 2722.9, 2727.1, 2709.6, 2741.8, 2760.1, 2778.8, 2792, 2764.1, 2771, 2759.4, 2754.5, 2769.8, 2750.7, 2726.5, 2716.2, 2721.8, 2717.9, 2732.8, 2740.3, 2789.7, 2807.7, 2842, 2827.4, 2827.5, 2827.5, 2827.5, 2847.8, 2832.5, 2846.5, 2846.5, 2861.5, 2833.6, 2826, 2816.5, 2799.2 )
# Bind the four index series into a 4-column matrix (column-major fill),
# naming the columns directly in the matrix() call
mtxEU <- matrix(
  c(numDAX, numSMI, numCAC, numFTSE),
  ncol = 4,
  dimnames = list(NULL, c("DAX", "SMI", "CAC", "FTSE"))
)
# Declare a daily series: 260 trading days per year, starting on day 130 of 1991
eu_stocks <- ts(data = mtxEU, start = c(1991, 130), frequency = 260)
# Inspect the structure: class "mts" = multiple time series stored as a matrix
str(eu_stocks)
## mts [1:400, 1:4] 1629 1614 1606 1621 1618 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:4] "DAX" "SMI" "CAC" "FTSE"
## - attr(*, "tsp")= num [1:3] 1991 1993 260
## - attr(*, "class")= chr [1:3] "mts" "ts" "matrix"
# Check whether eu_stocks is a ts object
is.ts(eu_stocks)
## [1] TRUE
# View the start, end, and frequency of eu_stocks
# start/end are reported as c(year, period-within-year); frequency is periods/year
start(eu_stocks)
## [1] 1991 130
end(eu_stocks)
## [1] 1993 9
frequency(eu_stocks)
## [1] 260
# Generate a simple plot of eu_stocks
# plot() on an mts object draws one stacked panel per series
plot(eu_stocks)
# Use ts.plot with eu_stocks
# ts.plot() overlays all four indices on a single set of axes
ts.plot(eu_stocks, col = 1:4, xlab = "Year", ylab = "Index Value",
main = "Major European Stock Indices, 1991-1998"
)
# Add a legend to your ts.plot
# bty = "n" suppresses the box normally drawn around the legend
legend("topleft", colnames(eu_stocks), lty = 1, col = 1:4, bty = "n")
Chapter 2 - Predicting the Future
Trend spotting - clear trends over time - many time series have some trend in the data:
White Noise (WN) model - simplest example of a stationary process (fixed constant mean, fixed constant variance, no correlation over time):
Random Walk (RW) model - simple example of a non-stationary process with no specified mean or variance, but with strong dependence over time:
Stationary Process - assumptions of stationary models help with parsimony and distributional stability:
Example code includes:
rapid_growth <- c( 506 , 447.4 , 542.6 , 516.1 , 507 , 535 , 496.9 , 497.6 , 577.2 , 536.9 , 541.2 , 473.5 , 551 , 569.4 , 522.9 , 487.2 , 594.6 , 591.2 , 616 , 621.3 , 607.1 , 587 , 554.2 , 644.1 , 509.7 , 607.1 , 603.6 , 613.6 , 544.9 , 670.8 , 687.1 , 615.6 , 711.2 , 694.3 , 681.9 , 659.1 , 642.7 , 601.5 , 666.8 , 651 , 606.1 , 696.7 , 641.6 , 855.8 , 667.3 , 573.5 , 791.7 , 751.6 , 610.8 , 624.7 , 833.3 , 639.9 , 736.8 , 772.3 , 686.9 , 667.8 , 712.9 , 918.2 , 656.1 , 700.5 , 683.5 , 781.7 , 715.7 , 808.3 , 820.8 , 656.9 , 733.3 , 773.5 , 641.2 , 932.2 , 680.7 , 988.3 , 664.9 , 813.5 , 883.4 , 924.3 , 969.4 , 777.3 , 881 , 971.4 , 903 , 1020.7 , 1075.1 , 886.2 , 889.6 , 950.4 , 878 , 1043.8 , 901.1 , 1079.7 , 933.9 , 921.9 , 870.8 , 811.1 , 1004.3 , 1008.2 , 1189.5 , 752 , 947.5 , 886.5 , 1074.9 , 1101.1 , 1130.2 , 975.8 , 948.2 , 1177.8 , 1227.1 , 977 , 836.7 , 1323.6 , 852.4 , 1200.8 , 1274.5 , 1349.3 , 1102.6 , 1324.9 , 1268.7 , 1058.2 , 1204.1 , 1084.7 , 1284.4 , 1195.3 , 1058.4 , 1188.1 , 1166.6 , 1064.7 , 1429.1 , 1070.9 , 1539.3 , 1467.2 , 1127.7 , 1296.1 , 1555.3 , 1332.9 , 1315.4 , 1189.2 , 1482.4 , 1240.9 , 1237.8 , 1468.6 , 1328.5 , 1589.5 , 1373.2 , 1503.6 , 1659.9 , 1704.6 , 1550.5 , 1625.8 , 1873.9 , 1370.6 , 1439.7 , 1447.4 , 1579.9 , 1681.3 , 1661.6 , 1311.8 , 1326 , 1323.1 , 1550.5 , 1606.2 , 1768.5 , 1509.8 , 1592.1 , 1627.6 , 1544.6 , 1439.5 , 1682.4 , 1850.7 , 1673.4 , 1832.4 , 1672.3 , 1781.6 , 1659.3 , 1970 , 2044.7 , 1929.1 , 1891.7 , 1487.2 , 2013.9 , 1796.8 , 1977 , 1517 , 1650.6 , 1523.3 , 1696.6 , 1627.3 , 1787.3 , 1567.3 , 1882 , 2319 , 1942 , 1820.3 , 2154.8 , 2261.5 , 2052.2 , 2079.2 , 2010.1 , 2145.3 , 1775.3 , 2013.4 )
# Log rapid_growth
# A log transform turns (approximately) exponential growth into linear growth,
# stabilizing the variance of the series
linear_growth <- log(rapid_growth)
# Plot linear_growth using ts.plot()
ts.plot(linear_growth)
z <- c( 6.23 , 6.1 , 6.3 , 6.25 , 6.23 , 6.28 , 6.21 , 6.21 , 6.36 , 6.29 , 6.29 , 6.16 , 6.31 , 6.34 , 6.26 , 6.19 , 6.39 , 6.38 , 6.42 , 6.43 , 6.41 , 6.38 , 6.32 , 6.47 , 6.23 , 6.41 , 6.4 , 6.42 , 6.3 , 6.51 , 6.53 , 6.42 , 6.57 , 6.54 , 6.52 , 6.49 , 6.47 , 6.4 , 6.5 , 6.48 , 6.41 , 6.55 , 6.46 , 6.75 , 6.5 , 6.35 , 6.67 , 6.62 , 6.41 , 6.44 , 6.73 , 6.46 , 6.6 , 6.65 , 6.53 , 6.5 , 6.57 , 6.82 , 6.49 , 6.55 , 6.53 , 6.66 , 6.57 , 6.69 , 6.71 , 6.49 , 6.6 , 6.65 , 6.46 , 6.84 , 6.52 , 6.9 , 6.5 , 6.7 , 6.78 , 6.83 , 6.88 , 6.66 , 6.78 , 6.88 , 6.81 , 6.93 , 6.98 , 6.79 , 6.79 , 6.86 , 6.78 , 6.95 , 6.8 , 6.98 , 6.84 , 6.83 , 6.77 , 6.7 , 6.91 , 6.92 , 7.08 , 6.62 , 6.85 , 6.79 , 6.98 , 7 , 7.03 , 6.88 , 6.85 , 7.07 , 7.11 , 6.88 , 6.73 , 7.19 , 6.75 , 7.09 , 7.15 , 7.21 , 7.01 , 7.19 , 7.15 , 6.96 , 7.09 , 6.99 , 7.16 , 7.09 , 6.96 , 7.08 , 7.06 , 6.97 , 7.26 , 6.98 , 7.34 , 7.29 , 7.03 , 7.17 , 7.35 , 7.2 , 7.18 , 7.08 , 7.3 , 7.12 , 7.12 , 7.29 , 7.19 , 7.37 , 7.22 , 7.32 , 7.41 , 7.44 , 7.35 , 7.39 , 7.54 , 7.22 , 7.27 , 7.28 , 7.37 , 7.43 , 7.42 , 7.18 , 7.19 , 7.19 , 7.35 , 7.38 , 7.48 , 7.32 , 7.37 , 7.39 , 7.34 , 7.27 , 7.43 , 7.52 , 7.42 , 7.51 , 7.42 , 7.49 , 7.41 , 7.59 , 7.62 , 7.56 , 7.55 , 7.3 , 7.61 , 7.49 , 7.59 , 7.32 , 7.41 , 7.33 , 7.44 , 7.39 , 7.49 , 7.36 , 7.54 , 7.75 , 7.57 , 7.51 , 7.68 , 7.72 , 7.63 , 7.64 , 7.61 , 7.67 , 7.48 , 7.61 )
# Generate the first difference of z
# diff() removes a linear trend; the result has one fewer observation
dz <- diff(z)
# Plot dz
ts.plot(dz)
# View the length of z and dz, respectively
# The differenced series is shorter by exactly one
length(z)
## [1] 200
length(dz)
## [1] 199
x <- c( -4.2 , 9.57 , 5.18 , -9.69 , -3.22 , 10.84 , 6.45 , -10.83 , -2.24 , 10.12 , 6.58 , -8.66 , -2.52 , 9.84 , 7.39 , -8.24 , -4.26 , 8.9 , 8.54 , -8.07 , -4.02 , 9.82 , 7.77 , -6.59 , -3.46 , 10.61 , 7.37 , -5.8 , -1.2 , 11.43 , 7.57 , -4.97 , -2 , 11.94 , 9.41 , -4.4 , -1.56 , 12.6 , 8.5 , -3.73 , -2.83 , 13.38 , 8.13 , -3.15 , -2.8 , 13.71 , 6.76 , -3.78 , -3.77 , 13.63 , 6.54 , -3.25 , -5.02 , 13.36 , 6.93 , -3.53 , -5.2 , 11.58 , 7.16 , -1.89 , -5.78 , 12.48 , 6.21 , -3.43 , -7.08 , 11.41 , 6.74 , -3.53 , -8.39 , 12.51 , 6.47 , -3.75 , -9.43 , 12.38 , 8.05 , -2.83 , -7.3 , 12.77 , 8.22 , -4.45 , -6.96 , 12.03 , 7.57 , -5.4 , -6.57 , 10.9 , 7.28 , -4.04 , -6.72 , 12.18 , 8.29 , -4.16 , -6.36 , 12.75 , 8.67 , -5.44 , -4.87 , 12.6 , 8.16 , -6.54 )
# Generate a diff of x with lag = 4. Save this to dx
# A lag-4 difference (x[t] - x[t-4]) removes a period-4 seasonal pattern
dx <- diff(x, lag=4)
# Plot dx
ts.plot(dx)
# View the length of x and dx, respectively
# A lag-k difference shortens the series by k observations (here 100 - 4 = 96)
length(x)
## [1] 100
length(dx)
## [1] 96
# Simulate a WN model with list(order = c(0, 0, 0))
# ARIMA(0, 0, 0) is pure white noise; arima.sim defaults are mean = 0, sd = 1
white_noise <- arima.sim(model = list(order=c(0, 0, 0)), n = 100)
# Plot your white_noise data
ts.plot(white_noise)
# Simulate from the WN model with: mean = 100, sd = 10
white_noise_2 <- arima.sim(model = list(order=c(0, 0, 0)), n = 100, mean = 100, sd = 10)
# Plot your white_noise_2 data
ts.plot(white_noise_2)
# Fit the WN model to white_noise_2 using the arima command
# NOTE(review): the estimates printed below depend on the RNG state at
# simulation time; re-running without the same seed gives different numbers
arima(white_noise_2, order=c(0, 0, 0))
##
## Call:
## arima(x = white_noise_2, order = c(0, 0, 0))
##
## Coefficients:
## intercept
## 102.5077
## s.e. 0.9957
##
## sigma^2 estimated as 99.14: log likelihood = -371.72, aic = 747.44
# Calculate the sample mean and sample variance of white_noise_2
# These closely match the arima() intercept and sigma^2 estimates above
mean(white_noise_2)
## [1] 102.5077
var(white_noise_2)
## [1] 100.1436
# Generate a RW model using arima.sim
# ARIMA(0, 1, 0) is a random walk: one order of integration, no AR/MA terms
random_walk <- arima.sim(model = list(order=c(0, 1, 0)), n = 100)
# Plot random_walk
ts.plot(random_walk)
# Calculate the first difference series
# Differencing a random walk recovers the underlying white noise
random_walk_diff <- diff(random_walk)
# Plot random_walk_diff
ts.plot(random_walk_diff)
# Generate a RW model with a drift using arima.sim
# mean = 1 gives the innovations a nonzero mean, i.e. a constant drift per step
rw_drift <- arima.sim(model = list(order=c(0, 1, 0)), n = 100, mean = 1)
# Plot rw_drift
ts.plot(rw_drift)
# Calculate the first difference series
rw_drift_diff <- diff(rw_drift)
# Plot rw_drift_diff
ts.plot(rw_drift_diff)
# Difference your random_walk data
rw_diff <- diff(random_walk)
# Plot rw_diff
ts.plot(rw_diff)
# Now fit the WN model to the differenced data
model_wn <-arima(rw_diff, order=c(0, 0, 0))
# Store the value of the estimated time trend (intercept)
int_wn <- model_wn$coef
# Plot the original random_walk data
ts.plot(random_walk)
# Use abline(0, ...) to add time trend to the figure
# abline(a = 0, b = int_wn): the mean of the differences is the per-period
# drift, so it serves as the slope of the fitted time trend
abline(0, int_wn)
# Use arima.sim() to generate WN data
white_noise <- arima.sim(model=list(order=c(0, 0, 0)), n=100)
# Use cumsum() to convert your WN data to RW
# A random walk is the cumulative sum of white noise
random_walk <- cumsum(white_noise)
# Use arima.sim() to generate WN drift data
wn_drift <- arima.sim(model=list(order=c(0, 0, 0)), n=100, mean=0.4)
# Use cumsum() to convert your WN drift data to RW
rw_drift <- cumsum(wn_drift)
# Plot all four data objects
plot.ts(cbind(white_noise, random_walk, wn_drift, rw_drift))
Chapter 3 - Correlation Analysis
Scatterplots of one series against another can be created using plot(a, b); ts.plot(cbind(a, b, ...)) overlays multiple time series on the same scale:
Covariance and Correlation - running cov(a, b) and cor(a, b):
Autocorrelation - how strongly is each observation related to its recent past?
Example code includes:
# Make a dummy eu_stocks, but shorter than the actual 1860x4
numDAX <- c( 1628.8, 1613.6, 1606.5, 1621, 1618.2, 1610.6, 1630.8, 1640.2, 1635.5, 1645.9, 1647.8, 1638.3, 1629.9, 1621.5, 1624.7, 1627.6, 1632, 1621.2, 1613.4, 1605, 1605.8, 1616.7, 1619.3, 1620.5, 1619.7, 1623.1, 1614, 1631.9, 1630.4, 1633.5, 1626.5, 1650.4, 1650.1, 1654.1, 1653.6, 1501.8, 1524.3, 1603.7, 1622.5, 1636.7, 1652.1, 1645.8, 1650.4, 1651.5, 1649.9, 1653.5, 1657.5, 1649.5, 1649.1, 1646.4, 1638.7, 1625.8, 1628.6, 1632.2, 1633.7, 1631.2, 1635.8, 1621.3, 1624.7, 1616.1, 1618.1, 1627.8, 1625.8, 1614.8, 1612.8, 1605.5, 1609.3, 1607.5, 1607.5, 1604.9, 1589.1, 1582.3, 1568, 1568.2, 1569.7, 1571.7, 1585.4, 1570, 1561.9, 1565.2, 1570.3, 1577, 1590.3, 1572.7, 1572.1, 1579.2, 1588.7, 1586, 1579.8, 1572.6, 1568.1, 1578.2, 1573.9, 1582.1, 1610.2, 1605.2, 1623.8, 1615.3, 1627.1, 1627, 1605.7, 1589.7, 1589.7, 1603.3, 1599.8, 1590.9, 1603.5, 1589.9, 1587.9, 1571.1, 1549.8, 1549.4, 1554.7, 1557.5, 1555.3, 1559.8, 1548.4, 1544, 1550.2, 1557, 1551.8, 1562.9, 1570.3, 1559.3, 1545.9, 1542.8, 1542.8, 1542.8, 1542.8, 1564.3, 1577.3, 1577.3, 1577.3, 1598.2, 1604, 1604.7, 1593.7, 1581.7, 1599.1, 1613.8, 1620.5, 1629.5, 1663.7, 1664.1, 1669.3, 1685.1, 1687.1, 1680.1, 1671.8, 1669.5, 1686.7, 1685.5, 1671, 1683.1, 1685.7, 1685.7, 1678.8, 1685.8, 1683.7, 1686.6, 1683.7, 1679.1, 1685, 1680.8, 1676.2, 1688.5, 1696.5, 1690.2, 1711.3, 1711.3, 1729.9, 1716.6, 1743.4, 1745.2, 1746.8, 1749.3, 1763.9, 1762.3, 1762.3, 1746.8, 1753.5, 1753.2, 1739.9, 1723.9, 1734.4, 1723.1, 1732.9, 1729.9, 1725.7, 1730.9, 1714.2, 1716.2, 1719.1, 1718.2, 1698.8, 1714.8, 1718.3, 1706.7, 1723.4, 1716.2, 1738.8, 1737.4, 1714.8, 1724.2, 1733.8, 1730, 1734.5, 1744.3, 1746.9, 1746.9, 1746.9, 1747.5, 1753.1, 1745.2, 1745.7, 1742.9, 1731.7, 1731.2, 1728.1, 1728.1, 1731.3, 1733.8, 1745.8, 1752.6, 1748.1, 1750.7, 1747.9, 1745.8, 1735.3, 1719.9, 1763.6, 1766.8, 1785.4, 1783.6, 1804.4, 1812.3, 1799.5, 1792.8, 1792.8, 1806.4, 1798.2, 1800.6, 1786.2, 1791.3, 1789, 1789, 1784.7, 1789.5, 1779.7, 1787, 1773.2, 1781.6, 
1773.8, 1773.8, 1776.3, 1770.7, 1772.4, 1762.5, 1764.3, 1752.8, 1756, 1755, 1759.9, 1759.8, 1776.5, 1770, 1767, 1752.3, 1760.2, 1750.3, 1731.4, 1735.5, 1733.8, 1730.8, 1699.5, 1652.7, 1654.1, 1636.8, 1622.8, 1613.4, 1617.8, 1617.2, 1637.6, 1622.2, 1608.5, 1605.1, 1609.6, 1624.9, 1618.1, 1612, 1579, 1561.4, 1547.9, 1548.6, 1560.2, 1554.8, 1531.9, 1526.1, 1509, 1530, 1485, 1464, 1475.1, 1516.1, 1519.7, 1530, 1516.4, 1515.5, 1543.9, 1534.7, 1538.7, 1536.7, 1523.8, 1527.1, 1530.2, 1601.5, 1580.3, 1595.1, 1579.5, 1600.6, 1566, 1557, 1542.7, 1536.3, 1510.7, 1481, 1483.8, 1470.1, 1484.8, 1475.4, 1402.3, 1421.5, 1434.6, 1446.3, 1437.7, 1441.6, 1471.6, 1454, 1453.8, 1458, 1479.6, 1504.9, 1496.5, 1511, 1528.9, 1534, 1536.6, 1508.2, 1493.5, 1489.7, 1482.4, 1483.3, 1470.6, 1484.8, 1487.7, 1508.6, 1515.3, 1509.8, 1542.3, 1541.8, 1542.5, 1550.3, 1550.3, 1543.4, 1547.8, 1523.6, 1526.7, 1513.4, 1523, 1529.7, 1545.1, 1546.8, 1528.1, 1530.7, 1526.2, 1519.5, 1506.7, 1504.3, 1480.7, 1476.7, 1478.1, 1479.6, 1477.5, 1472.6, 1495.6, 1517.5, 1520.9, 1527.1, 1527.1, 1527.1, 1547.5, 1545.8, 1538.4, 1538.4, 1538.4, 1538, 1554, 1551.2, 1538.4, 1529.1 )
numSMI <- c( 1678.1, 1688.5, 1678.6, 1684.1, 1686.6, 1671.6, 1682.9, 1703.6, 1697.5, 1716.3, 1723.8, 1730.5, 1727.4, 1733.3, 1734, 1728.3, 1737.1, 1723.1, 1723.6, 1719, 1721.2, 1725.3, 1727.2, 1727.2, 1731.6, 1724.1, 1716.9, 1723.4, 1723, 1728.4, 1722.1, 1724.5, 1733.6, 1739, 1726.2, 1587.4, 1630.6, 1685.5, 1701.3, 1718, 1726.2, 1716.6, 1725.8, 1737.4, 1736.6, 1732.4, 1731.2, 1726.9, 1727.8, 1720.2, 1715.4, 1708.7, 1713, 1713.5, 1718, 1701.7, 1701.7, 1684.9, 1687.2, 1690.6, 1684.3, 1679.9, 1672.9, 1663.1, 1669.3, 1664.7, 1672.3, 1687.7, 1686.8, 1686.6, 1675.8, 1677.4, 1673.2, 1665, 1671.3, 1672.4, 1676.2, 1692.6, 1696.5, 1716.1, 1713.3, 1705.1, 1711.3, 1709.8, 1688.6, 1698.9, 1700, 1693, 1683.9, 1679.2, 1673.9, 1683.9, 1688.4, 1693.9, 1720.9, 1717.9, 1733.6, 1729.7, 1735.6, 1734.1, 1699.3, 1678.6, 1675.5, 1670.1, 1652.2, 1635, 1654.9, 1642, 1638.7, 1622.6, 1596.1, 1612.4, 1625, 1610.5, 1606.6, 1610.7, 1603.1, 1591.5, 1605.2, 1621.4, 1622.5, 1626.6, 1627.4, 1614.9, 1602.3, 1598.3, 1627, 1627, 1627, 1655.7, 1670.1, 1670.1, 1670.1, 1670.1, 1704, 1711.8, 1700.5, 1690.3, 1715.4, 1723.5, 1719.4, 1734.4, 1772.8, 1760.3, 1747.2, 1750.2, 1755.3, 1754.6, 1751.2, 1752.5, 1769.4, 1767.6, 1750, 1747.1, 1753.5, 1752.8, 1752.9, 1764.7, 1776.8, 1779.3, 1785.1, 1798.2, 1794.1, 1795.2, 1780.4, 1789.5, 1794.2, 1784.4, 1800.1, 1804, 1816.2, 1810.5, 1821.9, 1828.2, 1840.6, 1841.1, 1846.3, 1850, 1839, 1820.2, 1815.2, 1820.6, 1807.1, 1791.4, 1806.2, 1798.7, 1818.2, 1820.5, 1833.3, 1837.1, 1818.2, 1824.1, 1830.1, 1835.6, 1828.7, 1839.2, 1837.2, 1826.7, 1838, 1829.1, 1843.1, 1850.5, 1827.1, 1829.1, 1848, 1840.5, 1853.8, 1874.1, 1871.3, 1871.3, 1871.3, 1860.5, 1874.7, 1880.1, 1874.7, 1875.6, 1859.5, 1874.2, 1880.1, 1880.1, 1907.7, 1920.5, 1937.3, 1936.8, 1949.1, 1963.7, 1950.8, 1953.5, 1945, 1921.1, 1939.1, 1928, 1933.4, 1925.7, 1931.7, 1928.7, 1924.5, 1914.2, 1914.2, 1920.6, 1923.3, 1930.4, 1915.2, 1916.9, 1913.8, 1913.8, 1899.7, 1888, 1868.8, 1879.9, 1865.7, 1881.3, 1873.1, 1862.5, 
1869.3, 1846.9, 1847.1, 1838.3, 1845.8, 1835.5, 1846.6, 1854.8, 1845.3, 1854.5, 1870.5, 1862.6, 1856.6, 1837.6, 1846.7, 1856.5, 1841.8, 1835, 1844.4, 1838.9, 1805.6, 1756.6, 1786.1, 1757.1, 1762.8, 1756.8, 1761.9, 1778.5, 1812.7, 1806.1, 1798.1, 1794.9, 1805.4, 1820.3, 1819.6, 1809.6, 1799.9, 1800.3, 1793.3, 1784.8, 1791.7, 1800.2, 1788.6, 1775.7, 1753.5, 1768.2, 1727.9, 1709.6, 1704.6, 1740.6, 1745.7, 1751.7, 1747.3, 1757.8, 1774.2, 1774.4, 1788.3, 1788, 1779.1, 1792.8, 1812, 1872.1, 1851.4, 1873.4, 1889.6, 1897.5, 1888.8, 1900.4, 1913.4, 1909.9, 1910.8, 1879.2, 1880.2, 1878.3, 1885.2, 1867.6, 1788, 1820.5, 1858.2, 1870.3, 1878.4, 1881.5, 1893.2, 1889.3, 1877.3, 1884, 1904.7, 1922.7, 1908.5, 1911.4, 1921.1, 1930.8, 1927.8, 1908.3, 1905.9, 1911.1, 1921.6, 1933.6, 1942, 1951.5, 1955.7, 1957.4, 1962.3, 1946.1, 1950.2, 1929.7, 1913.4, 1889.5, 1882.8, 1895.4, 1897.9, 1891.5, 1880.1, 1887, 1891.4, 1914.6, 1931.2, 1929.2, 1924.3, 1927, 1935, 1955.4, 1962.2, 1980.7, 1987.7, 1993.7, 2015.7, 2005, 2023.9, 2028.5, 2044.9, 2045.8, 2057.3, 2061.7, 2061.7, 2061.7, 2092.3, 2090.1, 2105.4, 2105.4, 2105.4, 2117.7, 2128.2, 2124.7, 2079.9, 2074.9 )
numCAC <- c( 1772.8, 1750.5, 1718, 1708.1, 1723.1, 1714.3, 1734.5, 1757.4, 1754, 1754.3, 1759.8, 1755.5, 1758.1, 1757.5, 1763.5, 1762.8, 1768.9, 1778.1, 1780.1, 1767.7, 1757.9, 1756.6, 1754.7, 1766.8, 1766.5, 1762.2, 1759.5, 1782.4, 1789.5, 1783.5, 1780.4, 1808.8, 1820.3, 1820.3, 1820.3, 1687.5, 1725.6, 1792.9, 1819.1, 1833.5, 1853.4, 1849.7, 1851.8, 1857.7, 1864.3, 1863.5, 1873.2, 1860.8, 1868.7, 1860.4, 1855.9, 1840.5, 1842.6, 1861.2, 1876.2, 1878.3, 1878.4, 1869.4, 1880.4, 1885.5, 1888.4, 1885.2, 1877.9, 1876.5, 1883.8, 1880.6, 1887.4, 1878.3, 1867.1, 1851.9, 1843.6, 1848.1, 1843.4, 1843.6, 1833.8, 1833.4, 1856.9, 1863.4, 1855.5, 1864.2, 1846, 1836.8, 1830.4, 1831.6, 1834.8, 1852.1, 1849.8, 1861.8, 1856.7, 1856.7, 1841.5, 1846.9, 1836.1, 1838.6, 1857.6, 1857.6, 1858.4, 1846.8, 1868.5, 1863.2, 1808.3, 1765.1, 1763.5, 1766, 1741.3, 1743.3, 1769, 1757.9, 1754.9, 1739.7, 1708.8, 1722.2, 1713.9, 1703.2, 1685.7, 1663.4, 1636.9, 1645.6, 1671.6, 1688.3, 1696.8, 1711.7, 1706.2, 1684.2, 1648.5, 1633.6, 1699.1, 1699.1, 1722.5, 1720.7, 1741.9, 1765.7, 1765.7, 1749.9, 1770.3, 1787.6, 1778.7, 1785.6, 1833.9, 1837.4, 1824.3, 1843.8, 1873.6, 1860.2, 1860.2, 1865.9, 1867.9, 1841.3, 1838.7, 1849.9, 1869.3, 1890.6, 1879.6, 1873.9, 1875.3, 1857, 1856.5, 1865.8, 1860.6, 1861.6, 1865.6, 1864.1, 1861.6, 1876.5, 1865.1, 1882.1, 1912.2, 1915.4, 1951.2, 1962.4, 1976.5, 1953.5, 1981.3, 1985.1, 1983.4, 1979.7, 1983.8, 1988.1, 1973, 1966.9, 1976.3, 1993.9, 1968, 1941.8, 1947.1, 1929.2, 1943.6, 1928.2, 1922, 1919.1, 1884.6, 1896.3, 1928.3, 1934.8, 1923.5, 1943.8, 1942.4, 1928.1, 1942, 1942.7, 1974.8, 1975.4, 1907.5, 1943.6, 1974.1, 1963.3, 1972.3, 1990.7, 1978.2, 1978.2, 1978.2, 1980.4, 1983.7, 1978.1, 1984.9, 1995.7, 2006.6, 2036.7, 2031.1, 2031.1, 2041.6, 2046.9, 2047.2, 2063.4, 2063.4, 2077.5, 2063.6, 2053.2, 2017, 2024, 2051.6, 2023.1, 2030.8, 2016.8, 2045.1, 2046.3, 2029.6, 2014.1, 2014.1, 2033.3, 2017.4, 2024.9, 1992.6, 1994.9, 1981.6, 1981.6, 1962.2, 1953.7, 1928.8, 1928.3, 1918.1, 
1931.4, 1908.8, 1891.8, 1913.9, 1885.8, 1895.8, 1899.6, 1920.3, 1915.3, 1907.3, 1900.6, 1880.9, 1873.5, 1883.6, 1868.5, 1879.1, 1847.8, 1861.8, 1859.4, 1859.4, 1859.4, 1853.3, 1851.2, 1801.8, 1767.9, 1762.7, 1727.5, 1734.6, 1734.6, 1755.4, 1769, 1801.6, 1782.6, 1754.7, 1784.4, 1787.6, 1798, 1793.8, 1777.3, 1755.2, 1737.8, 1730.1, 1722.4, 1753.5, 1757.3, 1736.7, 1734.2, 1724.2, 1744.2, 1689.7, 1667.7, 1667.8, 1687.6, 1687.5, 1684.9, 1674.2, 1711.4, 1780.5, 1779, 1779.3, 1763.7, 1756.8, 1774.2, 1802, 1873.6, 1836.2, 1859.8, 1852.7, 1882.9, 1826.1, 1832.8, 1828.9, 1829.5, 1843.5, 1770.3, 1731.9, 1736.7, 1724, 1683.3, 1611, 1612.5, 1654.2, 1673.9, 1657.3, 1655.1, 1685.1, 1667.9, 1650, 1664.2, 1679.1, 1731.3, 1722.2, 1730.7, 1766.4, 1770.7, 1774.5, 1749.9, 1730.9, 1742.4, 1742.4, 1786.9, 1804.1, 1804.7, 1793.6, 1786.7, 1798.5, 1798.5, 1821.5, 1796.8, 1772.7, 1764.4, 1759.2, 1722.3, 1724.2, 1674.8, 1720.6, 1721, 1739.7, 1749.7, 1771.4, 1792.3, 1783.3, 1799.4, 1781.7, 1788.6, 1765.9, 1791.2, 1769.5, 1758.7, 1738.3, 1744.8, 1736.7, 1735.2, 1760.1, 1786.3, 1824.4, 1821.1, 1854.6, 1854.6, 1857.5, 1870.3, 1858.8, 1857.8, 1857.8, 1843.1, 1850.8, 1859.6, 1844.5, 1852.6 )
numFTSE <- c( 2443.6, 2460.2, 2448.2, 2470.4, 2484.7, 2466.8, 2487.9, 2508.4, 2510.5, 2497.4, 2532.5, 2556.8, 2561, 2547.3, 2541.5, 2558.5, 2587.9, 2580.5, 2579.6, 2589.3, 2595, 2595.6, 2588.8, 2591.7, 2601.7, 2585.4, 2573.3, 2597.4, 2600.6, 2570.6, 2569.4, 2584.9, 2608.8, 2617.2, 2621, 2540.5, 2554.5, 2601.9, 2623, 2640.7, 2640.7, 2619.8, 2624.2, 2638.2, 2645.7, 2679.6, 2669, 2664.6, 2663.3, 2667.4, 2653.2, 2630.8, 2626.6, 2641.9, 2625.8, 2606, 2594.4, 2583.6, 2588.7, 2600.3, 2579.5, 2576.6, 2597.8, 2595.6, 2599, 2621.7, 2645.6, 2644.2, 2625.6, 2624.6, 2596.2, 2599.5, 2584.1, 2570.8, 2555, 2574.5, 2576.7, 2579, 2588.7, 2601.1, 2575.7, 2559.5, 2561.1, 2528.3, 2514.7, 2558.5, 2553.3, 2577.1, 2566, 2549.5, 2527.8, 2540.9, 2534.2, 2538, 2559, 2554.9, 2575.5, 2546.5, 2561.6, 2546.6, 2502.9, 2463.1, 2472.6, 2463.5, 2446.3, 2456.2, 2471.5, 2447.5, 2428.6, 2420.2, 2414.9, 2420.2, 2423.8, 2407, 2388.7, 2409.6, 2392, 2380.2, 2423.3, 2451.6, 2440.8, 2432.9, 2413.6, 2391.6, 2358.1, 2345.4, 2384.4, 2384.4, 2384.4, 2418.7, 2420, 2493.1, 2493.1, 2492.8, 2504.1, 2493.2, 2482.9, 2467.1, 2497.9, 2477.9, 2490.1, 2516.3, 2537.1, 2541.6, 2536.7, 2544.9, 2543.4, 2522, 2525.3, 2510.4, 2539.9, 2552, 2546.5, 2550.8, 2571.2, 2560.2, 2556.8, 2547.1, 2534.3, 2517.2, 2538.4, 2537.1, 2523.7, 2522.6, 2513.9, 2541, 2555.9, 2536.7, 2543.4, 2542.3, 2559.7, 2546.8, 2565, 2562, 2562.1, 2554.3, 2565.4, 2558.4, 2538.3, 2533.1, 2550.7, 2574.8, 2522.4, 2493.3, 2476, 2470.7, 2491.2, 2464.7, 2467.6, 2456.6, 2441, 2458.7, 2464.9, 2472.2, 2447.9, 2452.9, 2440.1, 2408.6, 2405.4, 2382.7, 2400.9, 2404.2, 2393.2, 2436.4, 2572.6, 2591, 2600.5, 2640.2, 2638.6, 2638.6, 2638.6, 2625.8, 2607.8, 2609.8, 2643, 2658.2, 2651, 2664.9, 2654.1, 2659.8, 2659.8, 2662.2, 2698.7, 2701.9, 2725.7, 2737.8, 2722.4, 2720.5, 2694.7, 2682.6, 2703.6, 2700.6, 2711.9, 2702, 2715, 2715, 2704.6, 2698.6, 2694.2, 2707.6, 2697.6, 2705.9, 2680.9, 2681.9, 2668.5, 2645.8, 2635.4, 2636.1, 2614.1, 2603.7, 2593.6, 2616.3, 2598.4, 2562.7, 2584.8, 
2550.3, 2560.6, 2532.6, 2557.3, 2534.1, 2515.8, 2521.2, 2493.9, 2476.1, 2497.1, 2469, 2493.7, 2472.6, 2497.9, 2490.8, 2478.3, 2484, 2486.4, 2483.4, 2431.9, 2403.7, 2415.6, 2387.9, 2399.5, 2377.2, 2348, 2373.4, 2423.2, 2411.6, 2399.6, 2420.2, 2407.5, 2392.8, 2377.6, 2350.1, 2325.7, 2309.6, 2303.1, 2318, 2356.8, 2376.1, 2354.7, 2363.5, 2359.4, 2365.7, 2311.1, 2281, 2285, 2311.6, 2312.6, 2312.6, 2298.4, 2313, 2381.9, 2362.2, 2372.2, 2337.7, 2327.5, 2340.6, 2370.9, 2422.1, 2370, 2378.3, 2483.9, 2567, 2560.1, 2586, 2580.5, 2621.2, 2601, 2560, 2565.5, 2553, 2572.3, 2549.7, 2446.3, 2488.4, 2517.1, 2538.8, 2541.2, 2557.2, 2584.7, 2574.7, 2546.6, 2563.9, 2562.2, 2617, 2645.7, 2658.1, 2669.7, 2661.6, 2669.8, 2650.4, 2642.3, 2658.3, 2687.8, 2705.6, 2691.7, 2711.1, 2702.7, 2695.4, 2714.6, 2696.8, 2726.4, 2697.5, 2679.6, 2679.2, 2704, 2706.2, 2732.4, 2722.9, 2727.1, 2709.6, 2741.8, 2760.1, 2778.8, 2792, 2764.1, 2771, 2759.4, 2754.5, 2769.8, 2750.7, 2726.5, 2716.2, 2721.8, 2717.9, 2732.8, 2740.3, 2789.7, 2807.7, 2842, 2827.4, 2827.5, 2827.5, 2827.5, 2847.8, 2832.5, 2846.5, 2846.5, 2861.5, 2833.6, 2826, 2816.5, 2799.2 )
# Assemble the four index vectors column-wise, attaching column names via the
# dimnames argument instead of a separate colnames()<- assignment
mtxEU <- matrix(
  c(numDAX, numSMI, numCAC, numFTSE),
  ncol = 4,
  dimnames = list(NULL, c("DAX", "SMI", "CAC", "FTSE"))
)
# Daily time series: frequency 260 trading days/year, starting day 130 of 1991
eu_stocks <- ts(data = mtxEU, start = c(1991, 130), frequency = 260)
# Plot eu_stocks
plot(eu_stocks)
# Use this code to convert prices to returns
# One-period simple returns: p[t] / p[t-1] - 1 (drops the first row)
returns <- eu_stocks[-1,] / eu_stocks[-nrow(eu_stocks),] - 1
# Convert returns to ts
# NOTE(review): the first return corresponds to the second trading day, so
# start = c(1991, 131) would arguably be more accurate here — confirm intent
returns <- ts(returns, start = c(1991, 130), frequency = 260)
# Plot returns
plot(returns)
# Use this code to convert prices to log returns
# diff(log(p)) preserves the ts attributes automatically
logreturns <- diff(log(eu_stocks))
# Plot logreturns
plot(logreturns)
# Create eu_percentreturns
# Simple returns scaled to percentages (same construction as `returns`, times 100)
eu_percentreturns <- ts(data=100 * (eu_stocks[-1,] / eu_stocks[-nrow(eu_stocks),] - 1),
start=c(1991, 130), frequency=260
)
str(eu_percentreturns)
## mts [1:399, 1:4] -0.933 -0.44 0.903 -0.173 -0.47 ...
## - attr(*, "dimnames")=List of 2
## ..$ : NULL
## ..$ : chr [1:4] "DAX" "SMI" "CAC" "FTSE"
## - attr(*, "tsp")= num [1:3] 1991 1993 260
## - attr(*, "class")= chr [1:3] "mts" "ts" "matrix"
# Generate means from eu_percentreturns
colMeans(eu_percentreturns)
## DAX SMI CAC FTSE
## -0.01093221 0.05714059 0.01778921 0.03823335
# Use apply to calculate sample variance from eu_percentreturns
# MARGIN = 2 applies FUN column-by-column (one value per index)
apply(eu_percentreturns, MARGIN = 2, FUN = var)
## DAX SMI CAC FTSE
## 0.9700197 0.7789079 1.3477730 0.8417013
# Use apply to calculate standard deviation from eu_percentreturns
apply(eu_percentreturns, MARGIN = 2, FUN = sd)
## DAX SMI CAC FTSE
## 0.9848958 0.8825576 1.1609363 0.9174428
# Display a histogram of percent returns for each index
# 2x2 panel grid, one histogram per column; hist() also returns its bin data
# (breaks/counts/density), which apply() collects and auto-prints below
par(mfrow = c(2,2))
apply(eu_percentreturns, MARGIN = 2, FUN = hist, main = "", xlab = "Percentage Return")
## $DAX
## $breaks
## [1] -10 -8 -6 -4 -2 0 2 4 6
##
## $counts
## [1] 1 0 1 4 208 178 5 2
##
## $density
## [1] 0.001253133 0.000000000 0.001253133 0.005012531 0.260651629 0.223057644
## [7] 0.006265664 0.002506266
##
## $mids
## [1] -9 -7 -5 -3 -1 1 3 5
##
## $xname
## [1] "newX[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## $SMI
## $breaks
## [1] -9 -8 -7 -6 -5 -4 -3 -2 -1 0 1 2 3 4
##
## $counts
## [1] 1 0 0 0 1 0 4 21 157 184 24 5 2
##
## $density
## [1] 0.002506266 0.000000000 0.000000000 0.000000000 0.002506266
## [6] 0.000000000 0.010025063 0.052631579 0.393483709 0.461152882
## [11] 0.060150376 0.012531328 0.005012531
##
## $mids
## [1] -8.5 -7.5 -6.5 -5.5 -4.5 -3.5 -2.5 -1.5 -0.5 0.5 1.5 2.5 3.5
##
## $xname
## [1] "newX[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## $CAC
## $breaks
## [1] -8 -7 -6 -5 -4 -3 -2 -1 0 1 2 3 4 5
##
## $counts
## [1] 1 0 0 1 4 8 38 154 128 52 8 3 2
##
## $density
## [1] 0.002506266 0.000000000 0.000000000 0.002506266 0.010025063
## [6] 0.020050125 0.095238095 0.385964912 0.320802005 0.130325815
## [11] 0.020050125 0.007518797 0.005012531
##
## $mids
## [1] -7.5 -6.5 -5.5 -4.5 -3.5 -2.5 -1.5 -0.5 0.5 1.5 2.5 3.5 4.5
##
## $xname
## [1] "newX[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## $FTSE
## $breaks
## [1] -5 -4 -3 -2 -1 0 1 2 3 4 5 6
##
## $counts
## [1] 1 1 4 25 178 148 34 4 2 1 1
##
## $density
## [1] 0.002506266 0.002506266 0.010025063 0.062656642 0.446115288
## [6] 0.370927318 0.085213033 0.010025063 0.005012531 0.002506266
## [11] 0.002506266
##
## $mids
## [1] -4.5 -3.5 -2.5 -1.5 -0.5 0.5 1.5 2.5 3.5 4.5 5.5
##
## $xname
## [1] "newX[, i]"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
# Display normal quantile plots of percent returns for each index
# 2x2 panel grid, one Q-Q plot per column
par(mfrow = c(2,2))
# qqnorm() also returns the plotted (x, y) coordinates, which apply() collects
# and auto-prints below
apply(eu_percentreturns, MARGIN = 2, FUN = qqnorm, main = "")
## $DAX
## $DAX$x
## [1] -1.362841938 -0.645200916 1.142773047 -0.332716397 -0.700349168
## [6] 1.522756851 0.880762990 -0.468960676 0.928046482 0.228125248
## [11] -0.862393206 -0.749370236 -0.757715106 0.399909659 0.346025823
## [16] 0.533070235 -0.957438716 -0.716497500 -0.774565430 0.138653062
## [21] 0.957438716 0.306270765 0.157708228 -0.125977957 0.434176329
## [26] -0.835371144 1.412187579 -0.170443132 0.386335624 -0.622163162
## [31] 1.794824260 -0.069160134 0.504322046 -0.100686285 -3.022583937
## [36] 1.861620217 3.022583937 1.465233793 1.130785550 1.192455456
## [41] -0.562265945 0.562265945 0.151350483 -0.176820835 0.454981140
## [46] 0.497200571 -0.724642010 -0.075458866 -0.306270765 -0.692343235
## [51] -1.073140638 0.332716397 0.461959623 0.176820835 -0.273510070
## [56] 0.577043938 -1.259028466 0.427283386 -0.791638608 0.247512940
## [61] 0.890060160 -0.215248044 -0.977501770 -0.221682051 -0.668586325
## [66] 0.475984791 -0.202406436 -0.037702589 -0.299693408 -1.465233793
## [71] -0.614557305 -1.301806749 0.094374049 0.189598120 0.253995872
## [76] 1.118958381 -1.412187579 -0.766113077 0.441089963 0.637484161
## [81] 0.732834875 1.095761965 -1.564098295 -0.119648113 0.783073486
## [86] 0.899434908 -0.319465652 -0.569639391 -0.684381435 -0.461959623
## [91] 0.937753841 -0.434176329 0.826498615 2.027546869 -0.483032470
## [96] 1.429424692 -0.783073486 0.997966220 -0.050279388 -1.735192204
## [101] -1.483865480 -0.031416549 1.107285697 -0.379575363 -0.808945725
## [106] 1.051055539 -1.154927051 -0.234577930 -1.522756851 -1.794824260
## [111] -0.081760594 0.652956285 0.352703444 -0.260489498 0.606986835
## [116] -1.018857387 -0.454981140 0.692343235 0.749370236 -0.504322046
## [121] 0.977501770 0.791638608 -0.997966220 -1.192455456 -0.366106357
## [126] -0.025131751 -0.018847945 -0.012564883 1.707553094 1.073140638
## [131] -0.006282318 0.000000000 1.585812035 0.668586325 0.119648113
## [136] -0.987682290 -1.029471543 1.395360129 1.154927051 0.716497500
## [141] 0.844309926 2.203366572 0.100686285 0.622163162 1.218437810
## [146] 0.241040394 -0.591949043 -0.741077227 -0.253995872 1.331704246
## [151] -0.151350483 -1.205344920 0.987682290 0.299693408 0.006282318
## [156] -0.577043938 0.724642010 -0.228125248 0.339363596 -0.326083868
## [161] -0.441089963 0.660751127 -0.406724252 -0.448024745 1.008356792
## [166] 0.808945725 -0.533070235 1.503027005 0.012564883 1.378918772
## [171] -1.051055539 1.898394677 0.215248044 0.170443132 0.273510070
## [176] 1.084381938 -0.164072354 0.018847945 -1.245269831 0.676462784
## [181] -0.062864145 -1.040202966 -1.331704246 0.908889378 -0.937753841
## [186] 0.871541335 -0.346025823 -0.399909659 0.614557305 -1.395360129
## [191] 0.234577930 0.312861400 -0.132312852 -1.631632667 1.205344920
## [196] 0.420410685 -0.967421566 1.287284949 -0.599450994 1.564098295
## [201] -0.157708228 -1.707553094 0.835371144 0.853316686 -0.386335624
## [206] 0.511469191 0.862393206 0.293128990 0.025131751 0.031416549
## [211] 0.113323060 0.629805182 -0.660751127 0.107002537 -0.293128990
## [216] -0.928046482 -0.094374049 -0.352703444 0.037702589 0.372832405
## [221] 0.280037647 0.967421566 0.684381435 -0.420410685 0.286577179
## [226] -0.286577179 -0.208822935 -0.899434908 -1.273029655 2.375107084
## [231] 0.359396830 1.362841938 -0.189598120 1.447097300 0.741077227
## [236] -1.008356792 -0.540325710 0.043990118 1.040202966 -0.676462784
## [241] 0.260489498 -1.084381938 0.591949043 -0.241040394 0.050279388
## [246] -0.393113587 0.525842714 -0.800262203 0.708400243 -1.062033337
## [251] 0.800262203 -0.629805182 0.056570646 0.266994125 -0.490104222
## [256] 0.195998259 -0.826498615 0.208822935 -0.947550382 0.366106357
## [261] -0.138653062 0.554922943 -0.043990118 1.231742970 -0.525842714
## [266] -0.312861400 -1.107285697 0.774565430 -0.844309926 -1.543097927
## [271] 0.483032470 -0.183205739 -0.339363596 -2.027546869 -2.375107084
## [276] 0.164072354 -1.503027005 -1.167254099 -0.871541335 0.547609740
## [281] -0.113323060 1.543097927 -1.378918772 -1.142773047 -0.372832405
## [286] 0.569639391 1.245269831 -0.606986835 -0.547609740 -2.203366572
## [291] -1.585812035 -1.218437810 0.125977957 1.018857387 -0.511469191
## [296] -1.861620217 -0.554922943 -1.608300307 1.681160057 -2.496817918
## [301] -1.827203533 1.029471543 2.496817918 0.490104222 0.947550382
## [306] -1.287284949 -0.144998850 2.079254280 -0.890060160 0.518642559
## [311] -0.247512940 -1.118958381 0.448024745 0.406724252 2.672947708
## [316] -1.764224226 1.179761118 -1.447097300 1.631632667 -2.280865771
## [321] -0.853316686 -1.316608391 -0.584480259 -1.980752397 -2.137203375
## [326] 0.379575363 -1.347109832 1.301806749 -0.918425797 -2.672947708
## [331] 1.655888698 1.167254099 1.062033337 -0.880762990 0.540325710
## [336] 2.137203375 -1.655888698 -0.056570646 0.599450994 1.827203533
## [341] 1.980752397 -0.817690678 1.273029655 1.483865480 0.645200916
## [346] 0.319465652 -2.079254280 -1.429424692 -0.413557785 -0.732834875
## [351] 0.144998850 -1.179761118 1.259028466 0.393113587 1.735192204
## [356] 0.766113077 -0.518642559 2.280865771 -0.107002537 0.132312852
## [361] 0.817690678 0.062864145 -0.652956285 0.584480259 -1.898394677
## [366] 0.413557785 -1.231742970 0.918425797 0.757715106 1.316608391
## [371] 0.221682051 -1.681160057 0.326083868 -0.475984791 -0.637484161
## [376] -1.130785550 -0.280037647 -1.937931511 -0.427283386 0.183205739
## [381] 0.202406436 -0.266994125 -0.497200571 1.937931511 1.764224226
## [386] 0.468960676 0.700349168 0.069160134 0.075458866 1.608300307
## [391] -0.195998259 -0.708400243 0.081760594 0.088065570 -0.088065570
## [396] 1.347109832 -0.359396830 -1.095761965 -0.908889378
##
## $DAX$y
## [1] -0.933202358 -0.440009916 0.902583256 -0.172732881 -0.469657644
## [6] 1.254190985 0.576404219 -0.286550421 0.635891165 0.115438362
## [11] -0.576526277 -0.512726607 -0.515369041 0.197348134 0.178494491
## [16] 0.270336692 -0.661764706 -0.481125093 -0.520639643 0.049844237
## [21] 0.678789388 0.160821426 0.074106095 -0.049367479 0.209915416
## [26] -0.560655536 1.109045849 -0.091917397 0.190137390 -0.428527701
## [31] 1.469412850 -0.018177412 0.242409551 -0.030227919 -9.179970972
## [36] 1.498202157 5.208948370 1.172289081 0.875192604 0.940917700
## [41] -0.381332849 0.279499332 0.066650509 -0.096881623 0.218195042
## [46] 0.241911098 -0.482654600 -0.024249773 -0.163725669 -0.467687075
## [51] -0.787209373 0.172222906 0.221048754 0.091900502 -0.153026872
## [56] 0.282000981 -0.886416432 0.209708259 -0.529328491 0.123754718
## [61] 0.599468512 -0.122865217 -0.676589986 -0.123854347 -0.452628968
## [66] 0.236686391 -0.111849873 0.000000000 -0.161741835 -0.984485015
## [71] -0.427915172 -0.903747709 0.012755102 0.095651065 0.127412881
## [76] 0.871667621 -0.971363694 -0.515923567 0.211281132 0.325836954
## [81] 0.426670063 0.843373494 -1.106709426 -0.038150951 0.451625215
## [86] 0.601570415 -0.169950274 -0.390920555 -0.455753893 -0.286150324
## [91] 0.644091576 -0.272462299 0.520998793 1.776120346 -0.310520432
## [96] 1.158734114 -0.523463481 0.730514456 -0.006145904 -1.309157959
## [101] -0.996450146 0.000000000 0.855507328 -0.218299757 -0.556319540
## [106] 0.792004526 -0.848144684 -0.125794075 -1.058001134 -1.355738018
## [111] -0.025809782 0.342067897 0.180099054 -0.141252006 0.289333248
## [116] -0.730862931 -0.284164299 0.401554404 0.438653077 -0.333975594
## [121] 0.715298363 0.473478789 -0.700503089 -0.859359969 -0.200530435
## [126] 0.000000000 0.000000000 0.000000000 1.393570132 0.831042639
## [131] 0.000000000 0.000000000 1.325049135 0.362908272 0.043640898
## [136] -0.685486384 -0.752964799 1.100082190 0.919267088 0.415169166
## [141] 0.555384141 2.098803314 0.024042796 0.312481221 0.946504523
## [146] 0.118687318 -0.414913165 -0.494018213 -0.137576265 1.030248577
## [151] -0.071144839 -0.860278849 0.724117295 0.154476858 0.000000000
## [156] -0.409325503 0.416964498 -0.124569937 0.172239710 -0.171943555
## [161] -0.273207816 0.351378715 -0.249258160 -0.273679200 0.733802649
## [166] 0.473793308 -0.371352785 1.248372974 0.000000000 1.086893005
## [171] -0.768830568 1.561225679 0.103246530 0.091680037 0.143118846
## [176] 0.834619562 -0.090708090 0.000000000 -0.879532429 0.383558507
## [181] -0.017108640 -0.758612822 -0.919593080 0.609084054 -0.651522140
## [186] 0.568742383 -0.173120203 -0.242788600 0.301326998 -0.964815992
## [191] 0.116672500 0.168977975 -0.052352975 -1.129088581 0.941841300
## [196] 0.204105435 -0.675085841 0.978496514 -0.417778809 1.316862836
## [201] -0.080515298 -1.300794290 0.548168883 0.556779956 -0.219171761
## [206] 0.260115607 0.565004324 0.149056928 0.000000000 0.000000000
## [211] 0.034346557 0.320457797 -0.450630312 0.028650011 -0.160394111
## [216] -0.642607149 -0.028873361 -0.179066543 0.000000000 0.185174469
## [221] 0.144400162 0.692121352 0.389506244 -0.256761383 0.148732910
## [226] -0.159936026 -0.120144173 -0.601443464 -0.887454619 2.540845398
## [231] 0.181447040 1.052750736 -0.100817744 1.166180758 0.437818665
## [236] -0.706284831 -0.372325646 0.000000000 0.758589915 -0.453941541
## [241] 0.133466800 -0.799733422 0.285522338 -0.128398370 0.000000000
## [246] -0.240357742 0.268952765 -0.547639005 0.410181491 -0.772243984
## [251] 0.473719829 -0.437808711 0.000000000 0.140940354 -0.315262062
## [256] 0.096007229 -0.558564658 0.102127660 -0.651816584 0.182565039
## [261] -0.056947608 0.279202279 -0.005682141 0.948971474 -0.365887982
## [266] -0.169491525 -0.831918506 0.450836044 -0.562436087 -1.079814889
## [271] 0.236802588 -0.097954480 -0.173030338 -1.808412295 -2.753751103
## [276] 0.084709869 -1.045885980 -0.855327468 -0.579245748 0.272716003
## [281] -0.037087403 1.261439525 -0.940400586 -0.844532117 -0.211377059
## [286] 0.280356364 0.950546720 -0.418487292 -0.376985353 -2.047146402
## [291] -1.114629512 -0.864608685 0.045222560 0.749063670 -0.346109473
## [296] -1.472858245 -0.378614792 -1.120503244 1.391650099 -2.941176471
## [301] -1.414141414 0.758196721 2.779472578 0.237451355 0.677765348
## [306] -0.888888889 -0.059351095 1.873968987 -0.595893516 0.260637258
## [311] -0.129979853 -0.839461183 0.216563854 0.202999149 4.659521631
## [316] -1.323758976 0.936531038 -0.977995110 1.335865780 -2.161689366
## [321] -0.574712644 -0.918432884 -0.414857069 -1.666341209 -1.965976038
## [326] 0.189061445 -0.923305028 0.999931977 -0.633081897 -4.954588586
## [331] 1.369179206 0.921561731 0.815558344 -0.594620756 0.271266606
## [336] 2.081021088 -1.195977168 -0.013755158 0.288898060 1.481481481
## [341] 1.709921600 -0.558176623 0.968927497 1.184645930 0.333573157
## [346] 0.169491525 -1.848236366 -0.974671794 -0.254435889 -0.490031550
## [351] 0.060712358 -0.856199016 0.965592275 0.195312500 1.404853129
## [356] 0.444120377 -0.362964429 2.152602994 -0.032419114 0.045401479
## [361] 0.505672609 0.000000000 -0.445075147 0.285084878 -1.563509497
## [366] 0.203465477 -0.871160018 0.634333289 0.439921208 1.006733346
## [371] 0.110025241 -1.208947505 0.170145933 -0.293983145 -0.438998821
## [376] -0.842382363 -0.159288511 -1.568836003 -0.270142500 0.094805986
## [381] 0.101481632 -0.141930251 -0.331641286 1.561863371 1.464295266
## [386] 0.224052718 0.407653363 0.000000000 0.000000000 1.335865366
## [391] -0.109854604 -0.478716522 0.000000000 0.000000000 -0.026001040
## [396] 1.040312094 -0.180180180 -0.825167612 -0.604524181
##
##
## $SMI
## $SMI$x
## [1] 0.826498615 -0.997966220 0.386335624 0.113323060 -1.378918772
## [6] 0.918425797 1.735192204 -0.629805182 1.608300307 0.577043938
## [11] 0.525842714 -0.379575363 0.413557785 -0.050279388 -0.599450994
## [16] 0.676462784 -1.287284949 -0.062864145 -0.511469191 0.062864145
## [21] 0.208822935 0.050279388 -0.195998259 0.234577930 -0.749370236
## [26] -0.732834875 0.511469191 -0.215248044 0.346025823 -0.645200916
## [31] 0.088065570 0.700349168 0.332716397 -1.142773047 -3.022583937
## [36] 2.496817918 3.022583937 1.301806749 1.412187579 0.622163162
## [41] -0.928046482 0.716497500 0.899434908 -0.247512940 -0.454981140
## [46] -0.260489498 -0.468960676 -0.018847945 -0.774565430 -0.533070235
## [51] -0.684381435 0.221682051 -0.056570646 0.247512940 -1.429424692
## [56] -0.189598120 -1.465233793 0.075458866 0.144998850 -0.660751127
## [61] -0.504322046 -0.724642010 -0.987682290 0.497200571 -0.525842714
## [66] 0.591949043 1.259028466 -0.253995872 -0.202406436 -1.062033337
## [71] 0.037702589 -0.483032470 -0.844309926 0.504322046 0.006282318
## [76] 0.183205739 1.395360129 0.195998259 1.631632667 -0.346025823
## [81] -0.826498615 0.454981140 -0.273510070 -1.827203533 0.800262203
## [86] 0.000000000 -0.708400243 -0.899434908 -0.540325710 -0.577043938
## [91] 0.774565430 0.260489498 0.372832405 1.861620217 -0.366106357
## [96] 1.245269831 -0.427283386 0.406724252 -0.266994125 -2.203366572
## [101] -1.764224226 -0.393113587 -0.591949043 -1.681160057 -1.608300307
## [106] 1.681160057 -1.231742970 -0.406724252 -1.447097300 -2.027546869
## [111] 1.447097300 1.051055539 -1.395360129 -0.461959623 0.241040394
## [116] -0.800262203 -1.130785550 1.142773047 1.429424692 0.012564883
## [121] 0.228125248 -0.025131751 -1.218437810 -1.245269831 -0.475984791
## [126] 1.980752397 -0.183205739 -0.176820835 1.937931511 1.192455456
## [131] -0.170443132 -0.164072354 -0.157708228 2.137203375 0.599450994
## [136] -1.095761965 -1.018857387 1.827203533 0.606986835 -0.441089963
## [141] 1.205344920 2.375107084 -1.107285697 -1.192455456 0.132312852
## [146] 0.299693408 -0.234577930 -0.399909659 0.018847945 1.378918772
## [151] -0.286577179 -1.483865480 -0.359396830 0.468960676 -0.241040394
## [156] -0.094374049 0.908889378 0.957438716 0.107002537 0.379575363
## [161] 0.997966220 -0.434176329 -0.006282318 -1.301806749 0.684381435
## [166] 0.253995872 -0.908889378 1.231742970 0.176820835 0.928046482
## [171] -0.569639391 0.853316686 0.427283386 0.937753841 -0.069160134
## [176] 0.280037647 0.138653062 -1.008356792 -1.543097927 -0.518642559
## [181] 0.312861400 -1.179761118 -1.362841938 1.107285697 -0.716497500
## [186] 1.522756851 0.056570646 0.967421566 0.151350483 -1.585812035
## [191] 0.366106357 0.393113587 0.319465652 -0.668586325 0.741077227
## [196] -0.306270765 -0.957438716 0.817690678 -0.835371144 1.029471543
## [201] 0.533070235 -1.937931511 0.043990118 1.465233793 -0.700349168
## [206] 0.977501770 1.543097927 -0.319465652 -0.151350483 -0.144998850
## [211] -0.977501770 1.018857387 0.286577179 -0.547609740 -0.031416549
## [216] -1.347109832 1.073140638 0.352703444 -0.138653062 1.764224226
## [221] 0.890060160 1.218437810 -0.221682051 0.862393206 1.008356792
## [226] -1.084381938 0.081760594 -0.757715106 -1.794824260 1.287284949
## [231] -0.967421566 0.273510070 -0.692343235 0.339363596 -0.326083868
## [236] -0.420410685 -0.880762990 -0.132312852 0.399909659 0.100686285
## [241] 0.490104222 -1.259028466 0.031416549 -0.339363596 -0.125977957
## [246] -1.154927051 -1.040202966 -1.522756851 0.766113077 -1.205344920
## [251] 1.118958381 -0.766113077 -0.947550382 0.461959623 -1.735192204
## [256] -0.088065570 -0.817690678 0.540325710 -0.937753841 0.791638608
## [261] 0.584480259 -0.862393206 0.652956285 1.154927051 -0.741077227
## [266] -0.584480259 -1.564098295 0.637484161 0.708400243 -1.273029655
## [271] -0.652956285 0.692343235 -0.562265945 -2.137203375 -2.496817918
## [276] 1.898394677 -1.980752397 0.359396830 -0.614557305 0.293128990
## [281] 1.316608391 2.079254280 -0.637484161 -0.783073486 -0.372832405
## [286] 0.757715106 1.095761965 -0.228125248 -0.918425797 -0.890060160
## [291] -0.075458866 -0.676462784 -0.808945725 0.518642559 0.614557305
## [296] -1.073140638 -1.118958381 -1.898394677 1.130785550 -2.375107084
## [301] -1.655888698 -0.554922943 2.280865771 0.306270765 0.420410685
## [306] -0.490104222 0.783073486 1.273029655 -0.081760594 1.062033337
## [311] -0.208822935 -0.853316686 1.040202966 1.503027005 2.672947708
## [316] -1.707553094 1.655888698 1.167254099 0.554922943 -0.791638608
## [321] 0.808945725 0.947550382 -0.386335624 -0.037702589 -2.079254280
## [326] -0.012564883 -0.280037647 0.483032470 -1.412187579 -2.672947708
## [331] 2.027546869 2.203366572 0.871541335 0.562265945 0.125977957
## [336] 0.835371144 -0.413557785 -1.051055539 0.448024745 1.564098295
## [341] 1.362841938 -1.167254099 0.119648113 0.668586325 0.660751127
## [346] -0.332716397 -1.503027005 -0.312861400 0.266994125 0.724642010
## [351] 0.844309926 0.569639391 0.629805182 0.170443132 0.025131751
## [356] 0.215248044 -1.316608391 0.157708228 -1.631632667 -1.331704246
## [361] -1.861620217 -0.622163162 0.880762990 0.069160134 -0.606986835
## [366] -1.029471543 0.475984791 0.202406436 1.707553094 1.179761118
## [371] -0.293128990 -0.497200571 0.094374049 0.547609740 1.483865480
## [376] 0.434176329 1.347109832 0.441089963 0.326083868 1.585812035
## [381] -0.871541335 1.331704246 0.189598120 1.084381938 -0.043990118
## [386] 0.732834875 0.164072354 -0.119648113 -0.113323060 1.794824260
## [391] -0.299693408 0.987682290 -0.107002537 -0.100686285 0.749370236
## [396] 0.645200916 -0.352703444 -2.280865771 -0.448024745
##
## $SMI$y
## [1] 0.619748525 -0.586319218 0.327653997 0.148447242 -0.889363216
## [6] 0.675999043 1.230019609 -0.358065274 1.107511046 0.436986541
## [11] 0.388676181 -0.179138977 0.341553780 0.040385392 -0.328719723
## [16] 0.509170862 -0.805940936 0.029017469 -0.266883268 0.127981385
## [21] 0.238205903 0.110125775 0.000000000 0.254747568 -0.433125433
## [26] -0.417609187 0.378589318 -0.023209934 0.313406849 -0.364498959
## [31] 0.139364729 0.527689185 0.311490540 -0.736055204 -8.040783223
## [36] 2.721431271 3.366858825 0.937407298 0.981602304 0.477299185
## [41] -0.556134863 0.535943143 0.672152045 -0.046045816 -0.241851895
## [46] -0.069268067 -0.248382625 0.052116509 -0.439865725 -0.279037321
## [51] -0.390579457 0.251653304 0.029188558 0.262620368 -0.948777648
## [56] 0.000000000 -0.987248046 0.136506618 0.201517307 -0.372648764
## [61] -0.261236122 -0.416691470 -0.585809074 0.372797787 -0.275564608
## [66] 0.456538716 0.920887401 -0.053327013 -0.011856770 -0.640341515
## [71] 0.095476787 -0.250387504 -0.490078891 0.378378378 0.065817029
## [76] 0.227218369 0.978403532 0.230414747 1.155319776 -0.163160655
## [81] -0.478608533 0.363615037 -0.087652662 -1.239911101 0.609972758
## [86] 0.064747778 -0.411764706 -0.537507383 -0.279113962 -0.315626489
## [91] 0.597407253 0.267236772 0.325752191 1.593954779 -0.174327387
## [96] 0.913906514 -0.224965390 0.341099613 -0.086425444 -2.006804683
## [101] -1.218148649 -0.184677708 -0.322291853 -1.071792108 -1.041036194
## [106] 1.217125382 -0.779503293 -0.200974421 -0.982486117 -1.633181314
## [111] 1.021239271 0.781443810 -0.892307692 -0.242160820 0.255197311
## [116] -0.471844540 -0.723598029 0.860823123 1.009220035 0.067842605
## [121] 0.252696456 0.049182344 -0.768096350 -0.780234070 -0.249641141
## [126] 1.795657887 0.000000000 0.000000000 1.763982790 0.869722776
## [131] 0.000000000 0.000000000 0.000000000 2.029818574 0.457746479
## [136] -0.660123846 -0.599823581 1.484943501 0.472193075 -0.237888019
## [141] 0.872397348 2.214022140 -0.705099278 -0.744191331 0.171703297
## [146] 0.291395269 -0.039879223 -0.193776359 0.074234810 0.964336662
## [151] -0.101729400 -0.995700385 -0.165714286 0.366321332 -0.039920160
## [156] 0.005705157 0.673170175 0.685668952 0.140702386 0.325970887
## [161] 0.733852445 -0.228005784 0.061312078 -0.824420677 0.511121096
## [166] 0.262643196 -0.546204437 0.879847568 0.216654630 0.676274945
## [171] -0.313842088 0.629660315 0.345792854 0.678262772 0.027165055
## [176] 0.282439846 0.200400802 -0.594594595 -1.022294725 -0.274695088
## [181] 0.297487880 -0.741513787 -0.868795307 0.826169476 -0.415236408
## [186] 1.084116306 0.126498735 0.703103543 0.207276496 -1.028795384
## [191] 0.324496755 0.328929335 0.300530026 -0.375898889 0.574178378
## [196] -0.108742932 -0.571521881 0.618601850 -0.484221980 0.765403750
## [201] 0.401497477 -1.264523102 0.109463084 1.033295063 -0.405844156
## [206] 0.722629720 1.095048009 -0.149405048 0.000000000 0.000000000
## [211] -0.577138887 0.763235689 0.288046087 -0.287218765 0.048007681
## [216] -0.858391981 0.790535090 0.314800982 0.000000000 1.468007021
## [221] 0.670965036 0.874772195 -0.025809116 0.635068154 0.749063670
## [226] -0.656923155 0.138404757 -0.435116458 -1.228791774 0.936963198
## [231] -0.572430509 0.280082988 -0.398262129 0.311575012 -0.155303619
## [236] -0.217763260 -0.535203949 0.000000000 0.334343329 0.140581068
## [241] 0.369157178 -0.787401575 0.088763576 -0.161719443 0.000000000
## [246] -0.736754102 -0.615886719 -1.016949153 0.593964041 -0.755359328
## [251] 0.836147291 -0.435868814 -0.565906786 0.365100671 -1.198309528
## [256] 0.010828957 -0.476422500 0.407985639 -0.558023621 0.604739853
## [261] 0.444059352 -0.512184602 0.498563919 0.862766244 -0.422346966
## [266] -0.322130355 -1.023376064 0.495211145 0.530676342 -0.791812550
## [271] -0.369204040 0.512261580 -0.298199957 -1.810865191 -2.713779353
## [276] 1.679380622 -1.623649292 0.324398156 -0.340367597 0.290300546
## [281] 0.942164709 1.922968794 -0.364097755 -0.442943359 -0.177965630
## [286] 0.584990807 0.825301872 -0.038455200 -0.549571334 -0.536030062
## [291] 0.022223457 -0.388824085 -0.473986505 0.386597938 0.474409778
## [296] -0.644372847 -0.721234485 -1.250211184 0.838323353 -2.279153942
## [301] -1.059089068 -0.292466074 2.111932418 0.293002413 0.343701667
## [306] -0.251184564 0.600927145 0.932984412 0.011272686 0.783363390
## [311] -0.016775709 -0.497762864 0.770052274 1.070950469 3.316777042
## [316] -1.105710165 1.188289943 0.864737910 0.418077900 -0.458498024
## [321] 0.614146548 0.684066512 -0.182920456 0.047122886 -1.653757588
## [326] 0.053214134 -0.101053079 0.367353458 -0.933587948 -4.262154637
## [331] 1.817673378 2.070859654 0.651167797 0.433085601 0.165034072
## [336] 0.621844273 -0.206000423 -0.635155878 0.356895541 1.098726115
## [341] 0.945030713 -0.738544755 0.151951795 0.507481427 0.504919057
## [346] -0.155376010 -1.011515717 -0.125766389 0.272836980 0.549421799
## [351] 0.624479600 0.434422838 0.489186406 0.215219062 0.086925398
## [356] 0.250332073 -0.825561841 0.210677766 -1.051174239 -0.844690885
## [361] -1.249085398 -0.354591162 0.669216061 0.131898280 -0.337214816
## [366] -0.602696273 0.367001755 0.233174351 1.226604631 0.867021832
## [371] -0.103562552 -0.253991292 0.140310762 0.415153088 1.054263566
## [376] 0.347754935 0.942819284 0.353410410 0.301856417 1.103475949
## [381] -0.530832961 0.942643392 0.227283957 0.808479172 0.044011932
## [386] 0.562127285 0.213872551 0.000000000 0.000000000 1.484212058
## [391] -0.105147445 0.732022391 0.000000000 0.000000000 0.584212026
## [396] 0.495820938 -0.164458228 -2.108532969 -0.240396173
##
##
## $CAC
## $CAC$x
## [1] -1.287284949 -1.707553094 -0.676462784 0.871541335 -0.629805182
## [6] 1.130785550 1.273029655 -0.312861400 0.075458866 0.372832405
## [11] -0.352703444 0.221682051 -0.119648113 0.393113587 -0.125977957
## [16] 0.406724252 0.584480259 0.176820835 -0.835371144 -0.668586325
## [21] -0.151350483 -0.202406436 0.741077227 -0.094374049 -0.346025823
## [26] -0.260489498 1.259028466 0.490104222 -0.454981140 -0.293128990
## [31] 1.522756851 0.692343235 -0.081760594 -0.075458866 -3.022583937
## [36] 1.980752397 2.375107084 1.395360129 0.826498615 1.040202966
## [41] -0.319465652 0.183205739 0.379575363 0.427283386 -0.132312852
## [46] 0.591949043 -0.800262203 0.511469191 -0.540325710 -0.339363596
## [51] -0.977501770 0.189598120 0.987682290 0.844309926 0.170443132
## [56] 0.043990118 -0.577043938 0.668586325 0.339363596 0.228125248
## [61] -0.273510070 -0.490104222 -0.157708228 0.483032470 -0.280037647
## [66] 0.434176329 -0.591949043 -0.716497500 -0.947550382 -0.554922943
## [71] 0.326083868 -0.359396830 0.056570646 -0.652956285 -0.100686285
## [76] 1.231742970 0.420410685 -0.518642559 0.533070235 -1.073140638
## [81] -0.606986835 -0.461959623 0.132312852 0.253995872 0.928046482
## [86] -0.215248044 0.700349168 -0.372832405 -0.069160134 -0.967421566
## [91] 0.359396830 -0.708400243 0.208822935 0.997966220 -0.062864145
## [96] 0.113323060 -0.757715106 1.118958381 -0.399909659 -2.137203375
## [101] -1.980752397 -0.189598120 0.215248044 -1.483865480 0.195998259
## [106] 1.412187579 -0.774565430 -0.286577179 -0.987682290 -1.655888698
## [111] 0.808945725 -0.584480259 -0.766113077 -1.142773047 -1.378918772
## [116] -1.564098295 0.606986835 1.503027005 0.977501770 0.569639391
## [121] 0.862393206 -0.434176329 -1.331704246 -1.861620217 -1.029471543
## [126] 2.672947708 -0.056570646 1.316608391 -0.195998259 1.192455456
## [131] 1.301806749 -0.050279388 -1.018857387 1.095761965 0.957438716
## [136] -0.599450994 0.475984791 2.137203375 0.266994125 -0.844309926
## [141] 1.029471543 1.564098295 -0.853316686 -0.043990118 0.366106357
## [146] 0.151350483 -1.503027005 -0.241040394 0.676462784 1.008356792
## [151] 1.073140638 -0.692343235 -0.420410685 0.138653062 -1.062033337
## [156] -0.113323060 0.562265945 -0.386335624 0.119648113 0.299693408
## [161] -0.164072354 -0.234577930 0.835371144 -0.724642010 0.908889378
## [166] 1.543097927 0.247512940 1.794824260 0.645200916 0.757715106
## [171] -1.218437810 1.347109832 0.273510070 -0.176820835 -0.306270765
## [176] 0.286577179 0.312861400 -0.880762990 -0.427283386 0.547609740
## [181] 0.880762990 -1.347109832 -1.395360129 0.346025823 -1.040202966
## [186] 0.774565430 -0.918425797 -0.441089963 -0.253995872 -1.681160057
## [191] 0.684381435 1.631632667 0.386335624 -0.700349168 1.018857387
## [196] -0.144998850 -0.871541335 0.766113077 0.107002537 1.608300307
## [201] 0.088065570 -2.375107084 1.827203533 1.483865480 -0.660751127
## [206] 0.525842714 0.918425797 -0.783073486 -0.037702589 -0.031416549
## [211] 0.164072354 0.241040394 -0.393113587 0.399909659 0.622163162
## [216] 0.629805182 1.447097300 -0.379575363 -0.025131751 0.577043938
## [221] 0.332716397 0.062864145 0.817690678 -0.018847945 0.724642010
## [226] -0.817690678 -0.622163162 -1.631632667 0.413557785 1.287284949
## [231] -1.465233793 0.461959623 -0.826498615 1.331704246 0.125977957
## [236] -0.957438716 -0.890060160 -0.012564883 0.937753841 -0.899434908
## [241] 0.448024745 -1.585812035 0.202406436 -0.808945725 -0.006282318
## [246] -1.084381938 -0.525842714 -1.316608391 -0.107002537 -0.645200916
## [251] 0.749370236 -1.231742970 -1.008356792 1.107285697 -1.522756851
## [256] 0.599450994 0.280037647 1.062033337 -0.366106357 -0.511469191
## [261] -0.468960676 -1.154927051 -0.504322046 0.614557305 -0.928046482
## [266] 0.637484161 -1.608300307 0.783073486 -0.221682051 0.000000000
## [271] 0.006282318 -0.448024745 -0.208822935 -2.027546869 -1.735192204
## [276] -0.406724252 -1.794824260 0.497200571 0.012564883 1.179761118
## [281] 0.800262203 1.764224226 -1.167254099 -1.543097927 1.655888698
## [286] 0.260489498 0.660751127 -0.332716397 -1.051055539 -1.273029655
## [291] -1.107285697 -0.533070235 -0.547609740 1.681160057 0.306270765
## [296] -1.245269831 -0.247512940 -0.684381435 1.084381938 -2.280865771
## [301] -1.362841938 0.050279388 1.154927051 -0.088065570 -0.266994125
## [306] -0.791638608 1.937931511 3.022583937 -0.170443132 0.069160134
## [311] -0.997966220 -0.497200571 0.967421566 1.465233793 2.496817918
## [316] -1.764224226 1.245269831 -0.475984791 1.585812035 -2.203366572
## [321] 0.441089963 -0.326083868 0.094374049 0.791638608 -2.496817918
## [326] -1.898394677 0.352703444 -0.862393206 -1.937931511 -2.672947708
## [331] 0.144998850 2.079254280 1.167254099 -1.118958381 -0.228125248
## [336] 1.707553094 -1.130785550 -1.179761118 0.853316686 0.890060160
## [341] 2.280865771 -0.637484161 0.554922943 1.861620217 0.319465652
## [346] 0.293128990 -1.447097300 -1.192455456 0.716497500 0.018847945
## [351] 2.027546869 0.947550382 0.100686285 -0.749370236 -0.483032470
## [356] 0.708400243 0.025131751 1.218437810 -1.429424692 -1.412187579
## [361] -0.569639391 -0.413557785 -1.827203533 0.157708228 -2.079254280
## [366] 2.203366572 0.081760594 1.051055539 0.652956285 1.205344920
## [371] 1.142773047 -0.614557305 0.899434908 -1.095761965 0.468960676
## [376] -1.301806749 1.362841938 -1.259028466 -0.732834875 -1.205344920
## [381] 0.454981140 -0.562265945 -0.183205739 1.378918772 1.429424692
## [386] 1.898394677 -0.299693408 1.735192204 0.031416549 0.234577930
## [391] 0.732834875 -0.741077227 -0.138653062 0.037702589 -0.908889378
## [396] 0.504322046 0.540325710 -0.937753841 0.518642559
##
## $CAC$y
## [1] -1.257897112 -1.856612396 -0.576251455 0.878168725 -0.510707446
## [6] 1.178323514 1.320265206 -0.193467623 0.017103763 0.313515362
## [11] -0.244345948 0.148105953 -0.034127752 0.341394026 -0.039693791
## [16] 0.346040390 0.520097236 0.112479613 -0.696590079 -0.554392714
## [21] -0.073951874 -0.108163498 0.689576566 -0.016979851 -0.243419190
## [26] -0.153217569 1.301506110 0.398339318 -0.335289187 -0.173815531
## [31] 1.595147158 0.635780628 0.000000000 0.000000000 -7.295500742
## [36] 2.257777778 3.900092721 1.461319650 0.791600242 1.085355877
## [41] -0.199633107 0.113531924 0.318608921 0.355278032 -0.042911549
## [46] 0.520525892 -0.661968823 0.424548581 -0.444159041 -0.241883466
## [51] -0.829786088 0.114099430 1.009443178 0.805931657 0.111928366
## [56] 0.005323963 -0.479131175 0.588424093 0.271218890 0.153805357
## [61] -0.169455624 -0.387226819 -0.074551361 0.389022116 -0.169869413
## [66] 0.361586728 -0.482144749 -0.596283874 -0.814096728 -0.448188347
## [71] 0.244087655 -0.254315243 0.010849517 -0.531568670 -0.021812630
## [76] 1.281771572 0.350045775 -0.423956209 0.468876314 -0.976290098
## [81] -0.498374865 -0.348432056 0.065559441 0.174710636 0.942882058
## [86] -0.124183359 0.648718780 -0.273928456 0.000000000 -0.818656757
## [91] 0.293239207 -0.584763658 0.136158161 1.033394974 0.000000000
## [96] 0.043066322 -0.624192854 1.175005415 -0.283649987 -2.946543581
## [101] -2.388984129 -0.090646422 0.141763538 -1.398640997 0.114856716
## [106] 1.474215568 -0.627473149 -0.170658172 -0.866146219 -1.776168305
## [111] 0.784176030 -0.481941702 -0.624307136 -1.027477689 -1.322892567
## [116] -1.593122520 0.531492455 1.579970831 0.999042833 0.503465024
## [121] 0.878123527 -0.321317988 -1.289415074 -2.119700748 -0.903851987
## [126] 4.009549461 0.000000000 1.377199694 -0.104499274 1.232056721
## [131] 1.366324129 0.000000000 -0.894829246 1.165780902 0.977235497
## [136] -0.497874245 0.387923765 2.704973118 0.190850101 -0.712963971
## [141] 1.068903141 1.616227357 -0.715200683 0.000000000 0.306418665
## [146] 0.107186880 -1.424059104 -0.141204584 0.609126013 1.048705335
## [151] 1.139463970 -0.581825875 -0.303256012 0.074710497 -0.975843865
## [156] -0.026925148 0.500942634 -0.278700825 0.053746103 0.214868930
## [161] -0.080403087 -0.134112977 0.800386764 -0.607513989 0.911479277
## [166] 1.599277403 0.167346512 1.869061293 0.574005740 0.718507949
## [171] -1.163673160 1.423086767 0.191793267 -0.085638003 -0.186548351
## [176] 0.207102086 0.216755721 -0.759519139 -0.309173847 0.477909401
## [181] 0.890553054 -1.298961834 -1.331300813 0.272942631 -0.919315906
## [186] 0.746423388 -0.792344104 -0.321543408 -0.150884495 -1.797717680
## [191] 0.620821394 1.687496704 0.337084479 -0.584039694 1.055367819
## [196] -0.072023871 -0.736202636 0.720916965 0.036045314 1.652339527
## [201] 0.030382824 -3.437278526 1.892529489 1.569252933 -0.547084747
## [206] 0.458411858 0.932920955 -0.627919827 0.000000000 0.000000000
## [211] 0.111212213 0.166633003 -0.282300751 0.343764218 0.544108016
## [216] 0.546174275 1.500049836 -0.274954583 0.000000000 0.516961253
## [221] 0.259600313 0.014656310 0.791324736 0.000000000 0.683338180
## [226] -0.669073406 -0.503973638 -1.763101500 0.347050074 1.363636364
## [231] -1.389159680 0.380604024 -0.689383494 1.403213011 0.058676837
## [236] -0.816107120 -0.763697280 0.000000000 0.953279380 -0.781980032
## [241] 0.371765639 -1.595140501 0.115427080 -0.666700085 0.000000000
## [246] -0.979006863 -0.433187239 -1.274504786 -0.025922854 -0.528963336
## [251] 0.693394505 -1.170135653 -0.890611903 1.168199598 -1.468206280
## [256] 0.530278927 0.200443085 1.089703095 -0.260375983 -0.417689135
## [261] -0.351281917 -1.036514785 -0.393428678 0.539097945 -0.801656403
## [266] 0.567299973 -1.665691022 0.757657755 -0.128907509 0.000000000
## [271] 0.000000000 -0.328062816 -0.113311390 -2.668539326 -1.881451881
## [276] -0.294134284 -1.996936518 0.410998553 0.000000000 1.199123717
## [281] 0.774752193 1.842849067 -1.054618117 -1.565129586 1.692597025
## [286] 0.179331988 0.581785634 -0.233592881 -0.919834987 -1.243459180
## [291] -0.991340018 -0.443088963 -0.445060979 1.805620065 0.216709438
## [296] -1.172252888 -0.143951172 -0.576634760 1.159958242 -3.124641670
## [301] -1.302006273 0.005996282 1.187192709 -0.005925575 -0.154074074
## [306] -0.635052525 2.221956755 4.037630011 -0.084245998 0.016863406
## [311] -0.876749283 -0.391222997 0.990437158 1.566903393 3.973362930
## [316] -1.996157131 1.285263043 -0.381761480 1.630053436 -3.016623294
## [321] 0.366902141 -0.212789175 0.032806605 0.765236403 -3.970707893
## [326] -2.169123877 0.277152261 -0.731271953 -2.360788863 -4.295134557
## [331] 0.093109870 2.586046512 1.190907992 -0.991696039 -0.132746033
## [336] 1.812579300 -1.020710937 -1.073205828 0.860606061 0.895325081
## [341] 3.108808290 -0.525616589 0.493554756 2.062749177 0.243432971
## [346] 0.214604394 -1.386306002 -1.085776330 0.664394246 0.000000000
## [351] 2.553948577 0.962560860 0.033257580 -0.615060675 -0.384701160
## [356] 0.660435440 0.000000000 1.278843481 -1.356025254 -1.341273375
## [361] -0.468212331 -0.294717751 -2.097544338 0.110317599 -2.865096857
## [366] 2.734654884 0.023247704 1.086577571 0.574811749 1.240212608
## [371] 1.179857740 -0.502148078 0.902820613 -0.983661220 0.387270584
## [376] -1.269149055 1.432697208 -1.211478339 -0.610341904 -1.159947689
## [381] 0.373928551 -0.464236589 -0.086370703 1.434993084 1.488551787
## [386] 2.132900409 -0.180881386 1.839547526 0.000000000 0.156367950
## [391] 0.689098250 -0.614874619 -0.053798149 0.000000000 -0.791258478
## [396] 0.417774402 0.475470067 -0.812002581 0.439143399
##
##
## $FTSE
## $FTSE$x
## [1] 0.800262203 -0.577043938 1.130785550 0.668586325 -0.977501770
## [6] 1.040202966 0.977501770 0.125977957 -0.652956285 1.655888698
## [11] 1.218437810 0.247512940 -0.692343235 -0.247512940 0.774565430
## [16] 1.412187579 -0.312861400 -0.031416549 0.434176329 0.326083868
## [21] 0.075458866 -0.253995872 0.164072354 0.454981140 -0.800262203
## [26] -0.547609740 1.192455456 0.183205739 -1.631632667 -0.056570646
## [31] 0.700349168 1.154927051 0.399909659 0.215248044 -2.672947708
## [36] 0.637484161 2.027546869 0.947550382 0.783073486 -0.012564883
## [41] -1.084381938 0.253995872 0.622163162 0.372832405 1.585812035
## [46] -0.441089963 -0.176820835 -0.062864145 0.234577930 -0.684381435
## [51] -1.167254099 -0.157708228 0.676462784 -0.757715106 -1.029471543
## [56] -0.518642559 -0.468960676 0.280037647 0.511469191 -1.107285697
## [61] -0.119648113 0.967421566 -0.107002537 0.202406436 1.084381938
## [66] 1.142773047 -0.075458866 -0.957438716 -0.037702589 -1.483865480
## [71] 0.189598120 -0.749370236 -0.645200916 -0.791638608 0.899434908
## [76] 0.132312852 0.138653062 0.441089963 0.547609740 -1.378918772
## [81] -0.808945725 0.107002537 -1.764224226 -0.708400243 1.861620217
## [86] -0.202406436 1.179761118 -0.490104222 -0.853316686 -1.205344920
## [91] 0.599450994 -0.260489498 0.228125248 0.997966220 -0.164072354
## [96] 0.937753841 -1.585812035 0.692343235 -0.732834875 -2.137203375
## [101] -2.079254280 0.448024745 -0.386335624 -0.947550382 0.468960676
## [106] 0.716497500 -1.347109832 -1.062033337 -0.366106357 -0.234577930
## [111] 0.319465652 0.221682051 -0.928046482 -1.040202966 1.095761965
## [116] -1.008356792 -0.584480259 1.980752397 1.465233793 -0.511469191
## [121] -0.346025823 -1.095761965 -1.273029655 -1.937931511 -0.716497500
## [126] 1.764224226 -0.006282318 0.000000000 1.681160057 0.100686285
## [131] 2.375107084 0.006282318 -0.018847945 0.518642559 -0.497200571
## [136] -0.461959623 -0.835371144 1.543097927 -1.118958381 0.569639391
## [141] 1.301806749 0.987682290 0.273510070 -0.195998259 0.406724252
## [146] -0.081760594 -1.154927051 0.195998259 -0.741077227 1.483865480
## [151] 0.540325710 -0.228125248 0.260489498 0.928046482 -0.483032470
## [156] -0.138653062 -0.406724252 -0.614557305 -0.880762990 1.018857387
## [161] -0.069160134 -0.676462784 -0.050279388 -0.359396830 1.331704246
## [166] 0.684381435 -1.018857387 0.352703444 -0.043990118 0.817690678
## [171] -0.622163162 0.844309926 -0.125977957 0.069160134 -0.326083868
## [176] 0.490104222 -0.293128990 -1.073140638 -0.208822935 0.826498615
## [181] 1.205344920 -2.203366572 -1.655888698 -0.937753841 -0.221682051
## [186] 1.008356792 -1.465233793 0.170443132 -0.525842714 -0.817690678
## [191] 0.880762990 0.346025823 0.379575363 -1.395360129 0.286577179
## [196] -0.660751127 -1.794824260 -0.144998850 -1.316608391 0.908889378
## [201] 0.208822935 -0.540325710 1.937931511 3.022583937 0.853316686
## [206] 0.420410685 1.707553094 -0.088065570 0.012564883 0.018847945
## [211] -0.562265945 -0.899434908 0.113323060 1.564098295 0.660751127
## [216] -0.286577179 0.606986835 -0.454981140 0.306270765 0.025131751
## [221] 0.144998850 1.631632667 0.176820835 1.118958381 0.504322046
## [226] -0.724642010 -0.100686285 -1.331704246 -0.533070235 0.918425797
## [231] -0.113323060 0.475984791 -0.379575363 0.554922943 0.031416549
## [236] -0.413557785 -0.241040394 -0.170443132 0.591949043 -0.393113587
## [241] 0.386335624 -1.287284949 0.088065570 -0.599450994 -1.192455456
## [246] -0.434176329 0.081760594 -1.142773047 -0.448024745 -0.427283386
## [251] 1.107285697 -0.890060160 -1.898394677 1.073140638 -1.861620217
## [256] 0.461959623 -1.543097927 1.245269831 -1.259028466 -0.987682290
## [261] 0.299693408 -1.503027005 -0.967421566 1.029471543 -1.564098295
## [266] 1.259028466 -1.179761118 1.287284949 -0.306270765 -0.606986835
## [271] 0.332716397 0.157708228 -0.132312852 -2.280865771 -1.707553094
## [276] 0.584480259 -1.608300307 0.562265945 -1.301806749 -1.735192204
## [281] 1.347109832 2.079254280 -0.554922943 -0.591949043 1.051055539
## [286] -0.668586325 -0.766113077 -0.826498615 -1.681160057 -1.429424692
## [291] -0.918425797 -0.299693408 0.749370236 1.794824260 0.957438716
## [296] -1.245269831 0.427283386 -0.183205739 0.359396830 -2.496817918
## [301] -1.827203533 0.266994125 1.447097300 0.094374049 0.037702589
## [306] -0.774565430 0.732834875 2.280865771 -1.130785550 0.483032470
## [311] -1.980752397 -0.504322046 0.652956285 1.608300307 2.203366572
## [316] -2.375107084 0.413557785 2.672947708 2.496817918 -0.273510070
## [321] 1.273029655 -0.215248044 1.735192204 -1.051055539 -2.027546869
## [326] 0.312861400 -0.569639391 0.890060160 -1.218437810 -3.022583937
## [331] 1.827203533 1.429424692 1.062033337 0.151350483 0.724642010
## [336] 1.316608391 -0.420410685 -1.522756851 0.808945725 -0.094374049
## [341] 2.137203375 1.362841938 0.525842714 0.497200571 -0.319465652
## [346] 0.393113587 -0.997966220 -0.332716397 0.708400243 1.395360129
## [351] 0.757715106 -0.637484161 0.871541335 -0.339363596 -0.280037647
## [356] 0.835371144 -0.862393206 1.378918772 -1.447097300 -0.871541335
## [361] -0.025131751 1.167254099 0.119648113 1.231742970 -0.372832405
## [366] 0.241040394 -0.844309926 1.503027005 0.766113077 0.791638608
## [371] 0.533070235 -1.412187579 0.339363596 -0.475984791 -0.189598120
## [376] 0.645200916 -0.908889378 -1.231742970 -0.399909659 0.293128990
## [381] -0.151350483 0.629805182 0.366106357 1.898394677 0.741077227
## [386] 1.522756851 -0.629805182 0.062864145 0.043990118 0.050279388
## [391] 0.862393206 -0.700349168 0.577043938 0.056570646 0.614557305
## [396] -1.362841938 -0.266994125 -0.352703444 -0.783073486
##
## $FTSE$y
## [1] 0.679325585 -0.487765222 0.906788661 0.578853627 -0.720408902
## [6] 0.855359170 0.823988102 0.083718705 -0.521808405 1.405461680
## [11] 0.959526160 0.164267835 -0.534947286 -0.227692066 0.668896321
## [16] 1.149110807 -0.285946134 -0.034876962 0.376027291 0.220136716
## [21] 0.023121387 -0.261981815 0.112021014 0.385847127 -0.626513434
## [26] -0.468012687 0.936540629 0.123200123 -1.153579943 -0.046681709
## [31] 0.603253678 0.924600565 0.321987121 0.145193336 -3.071346814
## [36] 0.551072623 1.855549031 0.810945847 0.674799848 0.000000000
## [41] -0.791456811 0.167951752 0.533495923 0.284284740 1.281324413
## [46] -0.395581430 -0.164855751 -0.048787811 0.153944355 -0.532353603
## [51] -0.844263531 -0.159647256 0.582502094 -0.609409894 -0.754055907
## [56] -0.445126631 -0.416281221 0.197398978 0.448101364 -0.799907703
## [61] -0.112424889 0.822789723 -0.084687043 0.130990908 0.873412851
## [66] 0.911622230 -0.052918053 -0.703426367 -0.038086533 -1.082069649
## [71] 0.127108851 -0.592421620 -0.514685964 -0.614594679 0.763209393
## [76] 0.085453486 0.089261458 0.376114773 0.479004906 -0.976509938
## [81] -0.628955235 0.062512209 -1.280699699 -0.537910849 1.741758460
## [86] -0.203244088 0.932127051 -0.430716697 -0.643024162 -0.851147284
## [91] 0.518237202 -0.263686095 0.149948702 0.827423168 -0.160218835
## [96] 0.806293788 -1.125994952 0.592970744 -0.585571518 -1.716013508
## [101] -1.590155420 0.385692826 -0.368033649 -0.698193627 0.404692801
## [106] 0.622913444 -0.971070200 -0.772216547 -0.345878284 -0.218990166
## [111] 0.219470786 0.148748037 -0.693126496 -0.760282509 0.874952903
## [116] -0.730411687 -0.493311037 1.810772204 1.167828994 -0.440528634
## [121] -0.323664372 -0.793291956 -0.911501492 -1.400735909 -0.538569187
## [126] 1.662829368 0.000000000 0.000000000 1.438517027 0.053747881
## [131] 3.020661157 0.000000000 -0.012033212 0.453305520 -0.435286131
## [136] -0.413123696 -0.636352652 1.248429330 -0.800672565 0.492352395
## [141] 1.052166580 0.826610500 0.177367861 -0.192791942 0.323254622
## [146] -0.058941412 -0.841393410 0.130848533 -0.590028907 1.175111536
## [151] 0.476396709 -0.215517241 0.168859219 0.799749098 -0.427815806
## [156] -0.132802125 -0.379380476 -0.502532292 -0.674742532 0.842205625
## [161] -0.051213363 -0.528162075 -0.043586797 -0.344882264 1.078006285
## [166] 0.586383314 -0.751203099 0.264122679 -0.043249194 0.684419620
## [171] -0.503965308 0.714622271 -0.116959064 0.003903201 -0.304437766
## [176] 0.434561328 -0.272861932 -0.785647280 -0.204861521 0.694800837
## [181] 0.944838672 -2.035109523 -1.153663178 -0.693859544 -0.214054927
## [186] 0.829724370 -1.063744380 0.117661379 -0.445777273 -0.635024017
## [191] 0.725112659 0.252165779 0.296158059 -0.982930184 0.204256710
## [196] -0.521831302 -1.290930700 -0.132857261 -0.943709986 0.763839342
## [201] 0.137448457 -0.457532651 1.805114491 5.590215071 0.715229729
## [206] 0.366653802 1.526629494 -0.060601470 0.000000000 0.000000000
## [211] -0.485105738 -0.685505370 0.076692998 1.272128132 0.575104048
## [216] -0.270859980 0.524330441 -0.405268490 0.214762066 0.000000000
## [221] 0.090232348 1.371046503 0.118575610 0.880861616 0.443922662
## [226] -0.562495434 -0.069791361 -0.948355082 -0.449029577 0.782822635
## [231] -0.110963160 0.418425535 -0.365057709 0.481125093 0.000000000
## [236] -0.383057090 -0.221844265 -0.163047506 0.497364709 -0.369330773
## [241] 0.307680902 -0.923907018 0.037300906 -0.499645774 -0.850665168
## [246] -0.393075818 0.026561433 -0.834566215 -0.397842470 -0.387909513
## [251] 0.875231339 -0.684172304 -1.373922414 0.862371717 -1.334726091
## [256] 0.403874054 -1.093493712 0.975282319 -0.907206820 -0.722149876
## [261] 0.214643453 -1.082817706 -0.713741529 0.848107912 -1.125305354
## [266] 1.000405022 -0.846132253 1.023214430 -0.284238761 -0.501846796
## [271] 0.229996368 0.096618357 -0.120656371 -2.073769832 -1.159587154
## [276] 0.495070100 -1.146713032 0.485782487 -0.929360283 -1.228335857
## [281] 1.081771721 2.098255667 -0.478705844 -0.497594958 0.858476413
## [286] -0.524750021 -0.610591900 -0.635239050 -1.156628533 -1.038253691
## [291] -0.692264695 -0.281434015 0.646954105 1.673856773 0.818906993
## [296] -0.900635495 0.373720644 -0.173471546 0.267017038 -2.307984952
## [301] -1.302410108 0.175361683 1.164113786 0.043260080 0.000000000
## [306] -0.614027502 0.635224504 2.978815391 -0.827070826 0.423334180
## [311] -1.454346177 -0.436326304 0.562835661 1.294539862 2.159517483
## [316] -2.151025969 0.350210970 4.440146323 3.345545312 -0.268796260
## [321] 1.011679231 -0.212683681 1.577213718 -0.770639402 -1.576316801
## [326] 0.214843750 -0.487234457 0.755973365 -0.878591144 -4.055379064
## [331] 1.720966357 1.153351551 0.862103214 0.094532850 0.629623800
## [336] 1.075394963 -0.386892096 -1.091389288 0.679337155 -0.066305238
## [341] 2.138786980 1.096675583 0.468685036 0.436401941 -0.303404877
## [346] 0.308085362 -0.726646191 -0.305614247 0.605533058 1.109731783
## [351] 0.662251656 -0.513749261 0.720734109 -0.309837335 -0.270100270
## [356] 0.712324701 -0.655713549 1.097597152 -1.060005869 -0.663577386
## [361] -0.014927601 0.925649448 0.081360947 0.968147218 -0.347679696
## [366] 0.154247310 -0.641707308 1.188367287 0.667444744 0.677511684
## [371] 0.475025191 -0.999283668 0.249629174 -0.418621436 -0.177574835
## [376] 0.555454710 -0.689580475 -0.879776057 -0.377773703 0.206170385
## [381] -0.143287530 0.548217374 0.274443794 1.802722330 0.645230670
## [386] 1.221640489 -0.513722730 0.003536818 0.000000000 0.000000000
## [391] 0.717948718 -0.537256830 0.494263019 0.000000000 0.526962937
## [396] -0.975013105 -0.268210051 -0.336164190 -0.614237529
# -- Covariance and correlation of EuStockMarkets log returns ---------------
# `eu_percentreturns` and `eu_stocks` are created earlier in these notes
# (eu_stocks holds the DAX, SMI, CAC and FTSE index series).
qqline(eu_percentreturns)
# Reset the plotting layout after the earlier multi-panel Q-Q plots
par(mfrow=c(1, 1))
# Make a scatterplot of DAX and FTSE
plot(eu_stocks[,"DAX"], eu_stocks[,"FTSE"])
# Make a scatterplot matrix of eu_stocks
pairs(eu_stocks)
# Convert eu_stocks to log returns
# diff(log(.)) gives continuously compounded (log) returns for each column
logreturns <- diff(log(eu_stocks))
# Plot logreturns
plot(logreturns)
# Make a scatterplot matrix of logreturns
pairs(logreturns)
# Extract single-index return series for the pairwise calculations below
DAX_logreturns <- logreturns[,"DAX"]
FTSE_logreturns <- logreturns[,"FTSE"]
# Use cov() with DAX_logreturns and FTSE_logreturns
cov(DAX_logreturns, FTSE_logreturns)
## [1] 5.092401e-05
# Use cov() with logreturns
# Passing the whole multivariate series returns the full covariance matrix
cov(logreturns)
## DAX SMI CAC FTSE
## DAX 9.883355e-05 6.840581e-05 8.373055e-05 5.092401e-05
## SMI 6.840581e-05 7.927600e-05 7.327089e-05 4.880343e-05
## CAC 8.373055e-05 7.327089e-05 1.357431e-04 6.848845e-05
## FTSE 5.092401e-05 4.880343e-05 6.848845e-05 8.353753e-05
# Use cor() with DAX_logreturns and FTSE_logreturns
# cor() is the standardized (scale-free) version of cov()
cor(DAX_logreturns, FTSE_logreturns)
## [1] 0.5604406
# Use cor() with logreturns
cor(logreturns)
## DAX SMI CAC FTSE
## DAX 1.0000000 0.7728049 0.7228911 0.5604406
## SMI 0.7728049 1.0000000 0.7063203 0.5997064
## CAC 0.7228911 0.7063203 1.0000000 0.6431579
## FTSE 0.5604406 0.5997064 0.6431579 1.0000000
xData <- c( 2.07, 1.3, 0.03, -0.34, 0.23, 0.47, 4.34, 2.82, 2.91, 2.33, 1.16, 0.82, -0.24, -0.03, -1.54, -0.69, -1.42, -0.77, 0.84, 0.04, 1.07, 1.5, -0.21, 0.33, -0.75, -0.11, 0.2, -0.17, 0.87, 1.47, 0.84, 0.96, 0.67, -0.26, 0.08, -1.46, -1.27, -2.19, -2.21, 0.42, -1.02, -1.54, -0.73, 0.7, -0.36, -0.77, -0.5, 1.31, 1.16, 0.69, -0.79, 0.33, 2.01, 1.71, 1, 0.69, 0.66, 1.51, 0.86, 1.97, 2.98, 3.02, 1.3, 0.71, 0.41, -0.53, -0.21, 1.73, -0.76, -1.34, -1.72, -2.78, -1.73, -3.49, -2.42, -0.14, -0.16, -0.28, -0.97, -1.53, -1.04, -1.26, -1.44, -1.24, -0.45, 1.13, 3.26, 1.14, 0.99, 0.38, 2.71, 2.42, 1.79, -1.03, -1.07, -2.63, -2.67, -1.3, -1.04, 0.4, -0.49, -0.49, -1.08, -0.27, -1.84, -2.1, -1.89, -1.85, -0.34, -1.21, -0.5, -0.58, -1.67, -1.41, -2.55, -0.87, -2.17, -2.6, -2.06, -0.88, 1.33, 1.08, -0.96, -1.81, -2.06, -2.34, -0.01, 0.77, 0.03, 1.17, 2.68, 4.58, 4.91, 4.13, 4.04, 1.35, 0.61, 1.43, 0.79, 1.34, 2.22, 2.83, 2.43, 1.89, 0.47, -1.31, -1.46, 0.21, 1.1, 1.42 )
# -- Lag-1 autocorrelation computed by hand, then compared with acf() -------
# `xData` is the numeric vector defined on the preceding line.
x <- ts(data=xData, start=c(1, 1), frequency=1)
n <- length(x)
# Define x_t0 as x[-1]
x_t0 <- x[-1]
# Define x_t1 as x[-n]
x_t1 <- x[-n]
# Confirm that x_t0 and x_t1 are (x[t], x[t-1]) pairs
head(cbind(x_t0, x_t1))
## x_t0 x_t1
## [1,] 1.30 2.07
## [2,] 0.03 1.30
## [3,] -0.34 0.03
## [4,] 0.23 -0.34
## [5,] 0.47 0.23
## [6,] 4.34 0.47
# Plot x_t0 and x_t1
plot(x_t0, x_t1)
# View the correlation between x_t0 and x_t1
cor(x_t0, x_t1)
## [1] 0.7630798
# Use acf with x
# plot = FALSE returns the numeric estimates instead of drawing the plot
acf(x, lag.max = 1, plot = FALSE)
##
## Autocorrelations of series 'x', by lag
##
## 0 1
## 1.000 0.758
# Confirm that difference factor is (n-1)/n
# acf() uses divisor n, cor() uses n-1, so they differ by a factor (n-1)/n
cor(x_t1, x_t0) * (n-1)/n
## [1] 0.7579926
# Generate ACF estimates for x up to lag-10
acf(x, lag.max = 10, plot = FALSE)
##
## Autocorrelations of series 'x', by lag
##
## 0 1 2 3 4 5 6 7 8 9 10
## 1.000 0.758 0.537 0.345 0.226 0.198 0.140 0.114 0.124 0.118 0.100
# Type the ACF estimate at lag-10
0.1 # may differ slightly due to rounding
## [1] 0.1
# Type the ACF estimate at lag-5
0.198 # may differ slightly due to rounding
## [1] 0.198
xData <- c( -0.037, -0.677, -0.735, -1.531, -2.27, -1.966, -0.964, -0.525, -0.894, -0.589, 1.174, 0.237, 0.495, 0.451, -0.075, 0.394, 1.694, 0.129, -0.378, 0.683, 1.725, 1.441, 0.601, 0.057, 0.066, -1.115, -0.638, -2.109, -1.634, -0.974, -3.366, -3.009, -4.468, -4.133, -5.638, -5.004, -3.228, -2.902, -2.652, -2.295, -3.406, -2.196, -0.02, 0.008, -1.067, -0.586, 0.362, -0.791, -0.724, -0.238, -0.006, -0.887, -1.354, -2.613, -1.704, -0.967, 0.407, 1.216, 2.585, 4.095, 1.323, 2.301, 1.051, 1.035, 0.328, -0.254, 0.115, -0.096, -1.291, -2.435, -0.34, -0.161, -0.194, 0.013, 0.67, 0.258, 0.408, 0.635, 0.787, 0.211, 0.571, 1.452, 1.149, 3.41, 0.329, 0.494, -0.782, -1.251, -2.175, -1.332, -0.258, 0.696, 1.803, 1.134, 0.341, 1.206, 2.518, 1.459, -0.077, -1.048, 0.459, -0.119, 0.019, 0.481, 0.53, 3.184, 2.545, 3.264, 1.889, 1.813, 0.152, -0.589, 0.69, -0.72, -0.858, -1.287, -1.528, -1.207, -2.333, -2.767, -3.079, -1.889, -1.805, -1.725, -2.02, -1.885, -1.857, -0.569, 0.45, -0.685, 0.144, -0.459, -0.716, 0.009, -0.269, 0.408, 1.515, 1.918, 2.316, 0.864, 0.868, -0.244, -1.638, -2.346, -0.934, -0.703, -1.651, -1.456, -0.166, -0.33 )
yData <- c( -1.363, -2.007, 1.459, 5.736, -0.604, -1.295, 1.261, 5.438, -1.159, -2.092, 1.03, 5.792, -0.529, 0.499, 0.937, 4.712, 2.557, 1.319, 2.033, 4.465, 1.995, 1.54, -0.411, 4.891, 0.482, 2.582, -0.763, 5.177, 0.569, 3.998, 0.479, 3.462, -0.742, 3.582, -1.834, 3.307, 0.894, 4.393, -0.535, 3.215, 0.605, 4.754, 0.364, 2.099, 2.121, 4.177, 1.053, 2.481, 3.878, 4.343, 2.663, 1.744, 6.083, 4.762, 1.744, 2.017, 6.513, 5.345, 0.633, 3.043, 5.872, 4.106, 0.143, 2.816, 5.296, 3.718, 1.703, 2.252, 4.088, 3.576, 1.084, 0.592, 2.83, 3.034, 1.845, 0.255, 3.195, 1.867, 0.608, 2.624, 3.104, 2.17, -0.087, 3.059, 3.751, 1.832, 0.933, 4.723, 2.821, 1.332, 0.24, 4.433, 3.374, 0.928, 2.101, 4.943, 3.517, 1.842, 0.582, 4.262, 2.347, 0.123, 0.035, 5.626, 4.225, 0.695, 0.846, 6.523, 2.926, 0.766, 0.242, 5.072, 2.156, 0.569, -1.052, 4.85, 1.204, 2.729, 0.828, 1.481, -1.803, 2.223, 0.816, 1.572, -1.601, 0.099, 1.694, 1.615, -2.158, 0.272, 1.636, 1.477, -2.183, 0.722, 1.851, 0.814, -1.248, 0.496, 2.982, 1.452, -1.673, 0.229, 2.828, 2.407, -0.046, 1.626, 5.61, 2.945, -0.771, 0.444 )
zData <- c( 0.316, 1.735, -0.009, 0.814, -0.929, -1.153, 0.863, 0.531, -1.166, -1.813, 1.612, 0.027, -0.441, 0.522, 0.67, 0.661, -0.603, 0.311, -0.495, -1.107, 0.571, -1.002, 0.257, 0.329, -1.939, -0.857, -1.363, -0.572, 0.805, -0.496, 0.174, -0.504, 0.131, 0.421, -0.229, -0.578, -0.469, 0.364, -0.866, 0.423, 0.464, -0.792, -0.764, -0.55, 0.566, 0.145, 0.483, 0.475, -0.17, 1.205, 0.776, -0.033, 0.118, 0.234, 0.127, 0.95, 0.448, -0.959, 1.425, 0.502, -2.396, 0.047, -0.168, 0.663, 0.181, 0.22, -1.99, 1.079, -0.868, 0.686, 0.482, -2.113, 1.368, 1.464, 0.072, 0.302, -1.101, 0.116, -0.043, 0.137, 0.362, -0.192, -0.305, 3.129, -0.378, 0.717, -0.711, 0.181, 0.689, 0.816, -0.799, 0.044, 0.54, -0.622, 0.545, -0.365, -0.759, -1.492, -1.17, -1.567, -1.613, 1.255, -0.322, 1.431, -0.316, 0.166, 0.194, -0.799, -1.252, -2.43, 0.18, -0.308, 0.504, -0.442, -0.364, -2.189, 0.526, -0.485, 0.211, -0.097, -0.966, 0.016, -0.06, -0.155, 0.101, 0.062, -0.735, -0.318, 1.038, 1.085, 0.691, 0.86, 0.432, 1.346, 1.928, 0.015, 0.971, 0.305, -0.772, -1.538, -1.304, -0.64, 1.134, 0.03, 0.739, 1.925, 0.988, 1.01, -0.214, 1.478 )
# -- Compare ACF patterns of three contrasting series -----------------------
# xData, yData and zData are the vectors defined on the preceding lines.
x <- ts(data=xData, start=c(1, 1), frequency=1)
y <- ts(data=yData, start=c(1, 1), frequency=1)
z <- ts(data=zData, start=c(1, 1), frequency=1)
# Plot all three series in one stacked panel
plot(cbind(x, y, z))
# View the ACF of x
acf(x)
# View the ACF of y
acf(y)
# View the ACF of z
acf(z)
Chapter 4 - Autoregression
Autoregressive Model - where current observations are highly dependent on previous observations:
AR Model Estimation and Forecasting - example from Mishkin data in package Ecdat:
Example code includes:
# -- Simulate AR(1) processes with different slopes -------------------------
# Larger |ar| means stronger persistence; a negative slope produces a
# series (and ACF) that alternates in sign.
# Simulate an AR model with 0.5 slope
x <- arima.sim(model = list(ar=0.5), n = 100)
# Simulate an AR model with 0.9 slope
y <- arima.sim(model = list(ar=0.9), n = 100)
# Simulate an AR model with -0.75 slope
z <- arima.sim(model = list(ar=-0.75), n = 100)
# Plot your simulated data
plot.ts(cbind(x, y, z))
# Calculate the ACF for x
acf(x)
# Calculate the ACF for y
acf(y)
# Calculate the ACF for z
acf(z)
# Compare persistent AR processes against a random walk
# Simulate and plot AR model with slope 0.9
x <- arima.sim(model = list(ar=0.9), n = 200)
ts.plot(x)
acf(x)
# Simulate and plot AR model with slope 0.98
y <- arima.sim(model = list(ar=0.98), n = 200)
ts.plot(y)
acf(y)
# Simulate and plot RW model
# order = c(0, 1, 0) means one difference of white noise, i.e. a random walk
z <- arima.sim(model = list(order=c(0, 1, 0)), n = 200)
ts.plot(z)
acf(z)
xData <- c( 0.829, 0.458, 0.053, 0.063, -0.736, -0.568, -0.056, -0.148, -0.461, -0.757, -1.571, -0.231, -1.261, -0.738, -0.75, -1.921, -2.473, -3.552, -1.912, -4.195, -2.818, -3.139, -1.296, -0.796, 0.83, -0.21, -0.313, 0.059, 1.527, 3.761, 3.255, 2.586, 1.214, 1.49, 2.389, 3.566, 3.843, 4.94, 4.685, 3.247, 2.398, 2.107, 1.644, -0.185, -1.972, -0.343, -2.117, -2.693, -2.261, -2.456, -2.08, -2.385, -1.553, -2.665, -3.956, -2.091, -1.692, -1.303, -2.698, -2.093, -2.658, -2.572, -1.599, -1.713, -1.587, -1.103, -1.194, -1.333, -0.3, -0.218, 1.675, 1.199, 1.165, 1.657, -0.531, -0.923, -0.912, -0.691, -0.517, -0.811, 1.785, 3.082, 1.498, 1.814, 2.774, 2.592, 2.433, 0.699, -0.315, -1.049, 1.062, 1.694, 2.755, 1.546, 0.908, 2.491, 1.926, -0.296, -0.731, -1.395 )
# -- Fit an AR(1) model to a simulated series -------------------------------
# `xData` is the vector defined on the preceding line.
x <- ts(data=xData, start=c(1, 1), frequency=1)
str(x)
## Time-Series [1:100] from 1 to 100: 0.829 0.458 0.053 0.063 -0.736 -0.568 -0.056 -0.148 -0.461 -0.757 ...
# Fit the AR model to x
# order = c(p, d, q) = c(1, 0, 0): AR order 1, no differencing, no MA part
arima(x, order = c(1, 0, 0))
##
## Call:
## arima(x = x, order = c(1, 0, 0))
##
## Coefficients:
## ar1 intercept
## 0.8575 -0.0948
## s.e. 0.0491 0.6703
##
## sigma^2 estimated as 1.022: log likelihood = -143.66, aic = 293.32
# Copy and paste the slope (ar1) estimate
0.8575 #
## [1] 0.8575
# Copy and paste the slope mean (intercept) estimate
-0.0948 #
## [1] -0.0948
# Copy and paste the innovation variance (sigma^2) estimate
1.022 #
## [1] 1.022
# -- Fit an AR(1) model to AirPassengers and overlay fitted values ----------
data(AirPassengers, package="datasets")
# Fit the AR model to AirPassengers
AR <- arima(AirPassengers, order = c(1, 0, 0))
print(AR)
##
## Call:
## arima(x = AirPassengers, order = c(1, 0, 0))
##
## Coefficients:
## ar1 intercept
## 0.9646 278.4649
## s.e. 0.0214 67.1141
##
## sigma^2 estimated as 1119: log likelihood = -711.09, aic = 1428.18
# Run the following commands to plot the series and fitted values
ts.plot(AirPassengers)
# Fitted values = observed series minus the model residuals
AR_fitted <- AirPassengers - residuals(AR)
# Overlay the fitted values as a dashed red line
points(AR_fitted, type = "l", col = 2, lty = 2)
# -- Fit an AR(1) model to the Nile series and forecast 10 steps ahead ------
data(Nile, package="datasets")
# Fit an AR model to Nile
AR_fit <-arima(Nile, order = c(1, 0, 0))
print(AR_fit)
##
## Call:
## arima(x = Nile, order = c(1, 0, 0))
##
## Coefficients:
## ar1 intercept
## 0.5063 919.5685
## s.e. 0.0867 29.1410
##
## sigma^2 estimated as 21125: log likelihood = -639.95, aic = 1285.9
# Use predict() to make a 1-step forecast
predict_AR <- predict(AR_fit)
# Obtain the 1-step forecast using $pred[1]
predict(AR_fit)$pred[1]
## [1] 828.6576
# Use predict to make 1-step through 10-step forecasts
# Note how the forecasts converge toward the fitted mean (~919.57)
predict(AR_fit, n.ahead = 10)
## $pred
## Time Series:
## Start = 1971
## End = 1980
## Frequency = 1
## [1] 828.6576 873.5426 896.2668 907.7715 913.5960 916.5448 918.0377
## [8] 918.7935 919.1762 919.3699
##
## $se
## Time Series:
## Start = 1971
## End = 1980
## Frequency = 1
## [1] 145.3439 162.9092 167.1145 168.1754 168.4463 168.5156 168.5334
## [8] 168.5380 168.5391 168.5394
# Run to plot the Nile series plus the forecast and 95% prediction intervals
ts.plot(Nile, xlim = c(1871, 1980))
AR_forecast <- predict(AR_fit, n.ahead = 10)$pred
AR_forecast_se <- predict(AR_fit, n.ahead = 10)$se
# Forecast line in red; +/- 2 standard errors gives ~95% prediction bands
points(AR_forecast, type = "l", col = 2)
points(AR_forecast - 2*AR_forecast_se, type = "l", col = 2, lty = 2)
points(AR_forecast + 2*AR_forecast_se, type = "l", col = 2, lty = 2)
Chapter 5 - Simple Moving Average
Simple Moving Average Model - a weighted average of the current and previous noise terms:
MA Model Estimation and Forecasting - inflation data available in Ecdat::Mishkin:
Compare the AR and MA models - differences and implications for usage:
Example code includes:
# -- Simulate MA(1) processes with different slopes -------------------------
# An MA(1) has nonzero autocorrelation only at lag 1, visible in acf() below.
# Generate MA model with slope 0.5
x <- arima.sim(model = list(ma=0.5), n = 100)
# Generate MA model with slope 0.9
y <- arima.sim(model = list(ma=0.9), n = 100)
# Generate MA model with slope -0.5
z <- arima.sim(model = list(ma=-0.5), n = 100)
# Plot all three models together
plot.ts(cbind(x, y, z))
# Calculate ACF for x
acf(x)
# Calculate ACF for y
acf(y)
# Calculate ACF for z
acf(z)
xData <- c( -0.291, 0.378, -0.413, 0.791, 2.626, 1.955, 1.321, -0.563, -1.005, -1.945, -1.3, -0.968, -1.621, -0.247, -0.911, -0.036, 0.203, 0.323, 1.032, -0.066, 1.104, 3.577, 1.925, 0.255, 0.092, 0.832, 0.578, -1.189, -0.927, -0.288, 0.092, -0.248, -1.739, 0.599, 1.404, 1.942, 2.002, 2.473, 2.005, -0.547, -0.085, 0.055, 1.08, 0.091, 0.038, 1.062, -0.571, -0.149, -0.297, -2.916, -0.892, 0.064, -1.894, -0.821, 0.296, 1.245, 2.076, 0.82, -0.445, -0.619, -0.308, -0.779, -0.619, 0.541, 0.313, -0.416, -0.637, -1.198, 0.382, 0.011, -0.55, 0.272, -1.323, -1.865, -1.996, 0.091, -1.318, -1.269, 0.259, 0.987, 1.746, 1.88, 0.435, -0.986, 0.229, 1.781, 3.713, 2.018, -0.461, -1.422, -0.604, 1.405, 2.359, 1.908, 2.052, 1.572, -0.755, -1.396, -0.522, -0.298 )
# -- Fit an MA(1) model to a simulated series -------------------------------
# `xData` is the vector defined on the preceding line.
x <- ts(data=xData, start=c(1, 1), frequency=1)
str(x)
## Time-Series [1:100] from 1 to 100: -0.291 0.378 -0.413 0.791 2.626 ...
# Fit the MA model to x
# order = c(0, 0, 1): no AR part, no differencing, MA order 1
arima(x, order = c(0, 0, 1))
##
## Call:
## arima(x = x, order = c(0, 0, 1))
##
## Coefficients:
## ma1 intercept
## 0.7927 0.1590
## s.e. 0.0902 0.1747
##
## sigma^2 estimated as 0.9576: log likelihood = -140.22, aic = 286.45
# Paste the slope (ma1) estimate below
0.7928 #
## [1] 0.7928
# Paste the slope mean (intercept) estimate below
0.1589 #
## [1] 0.1589
# Paste the innovation variance (sigma^2) estimate below
0.9576 #
## [1] 0.9576
# -- Fit an MA(1) model to the Nile series and forecast 10 steps ahead ------
# Nile was loaded earlier via data(Nile, package="datasets").
# Fit the MA model to Nile
MA <- arima(Nile, order = c(0, 0, 1))
print(MA)
##
## Call:
## arima(x = Nile, order = c(0, 0, 1))
##
## Coefficients:
## ma1 intercept
## 0.3783 919.2433
## s.e. 0.0791 20.9685
##
## sigma^2 estimated as 23272: log likelihood = -644.72, aic = 1295.44
# Plot Nile and MA_fit
ts.plot(Nile)
# Fitted values = observed series minus the model residuals
MA_fit <- Nile - resid(MA)
points(MA_fit, type = "l", col = 2, lty = 2)
# Make a 1-step forecast based on MA
predict_MA <- predict(MA)
# Obtain the 1-step forecast using $pred[1]
predict_MA$pred[1]
## [1] 868.8747
# Make a 1-step through 10-step forecast based on MA
# For an MA(1), every forecast beyond 1 step equals the mean (919.2433),
# as the output below shows
predict(MA, n.ahead=10)
## $pred
## Time Series:
## Start = 1971
## End = 1980
## Frequency = 1
## [1] 868.8747 919.2433 919.2433 919.2433 919.2433 919.2433 919.2433
## [8] 919.2433 919.2433 919.2433
##
## $se
## Time Series:
## Start = 1971
## End = 1980
## Frequency = 1
## [1] 152.5508 163.1006 163.1006 163.1006 163.1006 163.1006 163.1006
## [8] 163.1006 163.1006 163.1006
# Plot the Nile series plus the forecast and 95% prediction intervals
ts.plot(Nile, xlim = c(1871, 1980))
MA_forecasts <- predict(MA, n.ahead = 10)$pred
MA_forecast_se <- predict(MA, n.ahead = 10)$se
# Forecast line in red; +/- 2 standard errors gives ~95% prediction bands
points(MA_forecasts, type = "l", col = 2)
points(MA_forecasts - 2*MA_forecast_se, type = "l", col = 2, lty = 2)
points(MA_forecasts + 2*MA_forecast_se, type = "l", col = 2, lty = 2)
# These should actually be from fitting MA and AR to the Nile data
ARFitData <- c( 947.15, 1021.04, 1041.29, 941.56, 1066.61, 1041.29, 1041.29, 865.62, 1076.73, 1147.61, 1031.17, 957.76, 927.38, 1015.98, 957.25, 970.41, 940.04, 1051.42, 858.53, 939.03, 1031.17, 1010.92, 1066.61, 1036.23, 1086.86, 1091.92, 1071.67, 975.48, 1010.92, 845.87, 879.29, 896.5, 805.37, 929.91, 875.74, 808.91, 917.76, 804.36, 970.41, 985.6, 944.59, 874.73, 821.57, 684.88, 871.18, 809.42, 1021.04, 1010.92, 875.23, 840.81, 869.67, 842.83, 881.82, 891.44, 890.42, 807.39, 881.82, 830.68, 857.01, 980.54, 838.28, 849.41, 891.94, 881.82, 931.94, 952.19, 908.14, 870.17, 965.35, 844.35, 796.26, 782.59, 882.32, 865.11, 829.67, 859.54, 980.54, 889.41, 896.5, 883.34, 904.6, 830.68, 833.21, 878.27, 985.6, 918.77, 953.2, 857.52, 921.31, 947.63, 866.63, 970.41, 912.7, 910.17, 1046.36, 915.74, 831.7, 919.28, 817.52, 815.49 )
MAFitData <- c( 932.23, 987.22, 984, 911.36, 1032.19, 967.59, 992.03, 851.52, 1062.41, 1035.6, 958.74, 932.96, 920.01, 991.11, 920.34, 956.94, 920.4, 1017.44, 836.61, 965.16, 985.38, 962.6, 1012.83, 971.13, 1024.73, 1008.24, 999.35, 930.84, 983.23, 840.1, 919.21, 902.14, 840.51, 956.88, 872.38, 854.41, 942.54, 824.47, 993.21, 940.73, 929.94, 881.82, 860.3, 766.31, 941.07, 828.81, 1029.39, 945.95, 876.14, 876.82, 898.13, 870.02, 909.78, 901.93, 904.14, 841.27, 920.66, 852.42, 897.9, 973, 838.29, 897.57, 906.92, 895.82, 937.47, 936.84, 904.17, 888.16, 965.33, 845.73, 855.04, 841.3, 921.02, 878, 867.8, 893.98, 974.48, 875.94, 918.51, 892.57, 918.27, 853.32, 879.78, 903.44, 974.68, 897.8, 952.61, 860.38, 942.93, 931.37, 875.22, 974.01, 893.52, 922.07, 1013.03, 881.03, 868.17, 938.47, 835.84, 873.15 )
# -- Compare fitted AR vs MA values, then compare models via AIC/BIC --------
# ARFitData and MAFitData are the vectors defined on the preceding lines.
AR_fit <- ts(data=ARFitData, start=c(1871, 1), frequency=1)
MA_fit <- ts(data=MAFitData, start=c(1871, 1), frequency=1)
# Find correlation between AR_fit and MA_fit
cor(AR_fit, MA_fit)
## [1] 0.9401758
# Need to create AR and MA, though the MA model is probably already OK from exercises above
# NOTE(review): the values printed below match AR fit to AirPassengers
# (aic = 1428.18 above) and MA fit to Nile (aic = 1295.44 above), so these
# AIC/BIC numbers are NOT comparing two models of the same series — the
# course exercise fits both models to Nile. Verify before drawing
# conclusions from this comparison. Lower AIC/BIC indicates a better fit.
# Find AIC of AR
AIC(AR)
## [1] 1428.179
# Find AIC of MA
AIC(MA)
## [1] 1295.442
# Find BIC of AR
BIC(AR)
## [1] 1437.089
# Find BIC of MA
BIC(MA)
## [1] 1303.257
Chapter 1 - Time Series Data and Models
A time series is a sequence of data points in chronological order (recorded sequentially over time), especially common in finance and economics:
Stationarity and Non-Stationarity - definitions, and conversions from non-stationarity to stationarity:
Stationary Time Series - ARMA:
Example code includes:
# -- Load and plot example series from datasets and astsa -------------------
data(AirPassengers, package="datasets")
data(djia, package="astsa")
data(soi, package="astsa")
# View a detailed description of AirPassengers
help(AirPassengers)
## starting httpd help server ...
## done
# Plot AirPassengers
plot(AirPassengers)
# Plot the DJIA daily closings
plot(djia[,"Close"])
# Plot the Southern Oscillation Index
plot(soi)
yData <- c( 1.0752, -1.2233, -0.8066, 2.2028, -0.1881, 0.909, -1.197, -0.6968, 1.1385, -3.7502, 3.2141, -3.4124, -0.5707, 2.4628, 0.8797, 2.647, 3.3487, 2.1274, 1.4951, -1.0343, -0.2178, 2.5329, -0.3333, -1.1314, 3.4232, -2.6573, 2.3444, 5.107, 2.7611, 0.2877, -1.4333, 2.9236, 0.1324, 4.2033, 0.1539, -0.4517, 5.2934, 0.9239, 6.3714, 6.8761, 2.6617, 4.1279, 6.1697, 2.6619, 2.3581, 8.5626, 3.6387, 3.0449, 1.5867, 5.2176, 5.6889, 2.4215, 3.6722, 3.6326, 4.4526, 5.3535, 6.808, 5.5121, 6.7058, 3.7262, 9.6174, 7.8367, 5.1775, 5.8864, 4.2734, 12.0168, 5.0889, 6.2802, 4.2652, 4.162, 5.9201, 8.9842, 13.745, 9.4167, 8.9174, 7.543, 6.2326, 9.2702, 8.9234, 9.2996, 6.5795, 9.4189, 8.9092, 10.9316, 9.9733, 7.8103, 10.2368, 10.29, 8.6811, 10.3147, 6.7295, 12.7876, 5.988, 9.3356, 10.5408, 10.1422, 10.2608, 9.0473, 11.5869, 13.5886, 9.4664, 7.4157, 11.0767, 14.2901, 11.2511, 11.6835, 11.5153, 9.0543, 11.5185, 11.4878, 9.0081, 11.8876, 10.8354, 8.4025, 11.3758, 10.3381, 10.4919, 14.8334, 11.638, 12.1553, 14.1939, 13.2541, 9.6846, 12.8065, 14.3461, 12.9815, 11.5454, 12.7671, 12.6851, 11.4467, 12.9778, 12.6478, 15.6949, 12.0763, 12.1423, 13.4401, 15.3413, 14.4367, 13.863, 13.1309, 10.9893, 12.3688, 13.5126, 14.678, 15.2781, 15.5538, 14.0693, 14.6665, 15.6628, 14.0735, 15.6187, 14.4782, 15.2514, 13.011, 11.4298, 20.1918, 19.0593, 16.7098, 15.6343, 11.2168, 18.6198, 15.2306, 17.6491, 16.8749, 17.8477, 15.4435, 19.3254, 19.3206, 15.1768, 17.6434, 13.9196, 20.696, 21.2888, 16.4249, 20.2915, 17.4472, 15.4037, 18.6493, 17.7711, 18.5901, 18.5847, 18.4996, 20.1874, 21.1373, 18.3648, 19.7737, 20.3995, 19.5494, 19.2275, 18.8669, 20.7898, 22.0548, 20.5807, 19.3122, 16.1878, 16.5707, 18.108, 22.0924, 22.4979, 19.8109, 21.9049, 24.0603, 20.8068, 23.1255, 20.6354, 23.8614, 17.866, 20.3238, 17.4633, 19.1253, 19.322, 22.6845, 21.8192, 18.6206, 24.9521, 21.9321, 18.4697, 19.5132, 22.2926, 21.4382, 25.9301, 17.8538, 20.7046, 22.3747, 21.0983, 25.7179, 19.8315, 27.5421, 20.7885, 17.8304, 23.0441, 
21.0823, 21.6648, 24.2464, 25.5073, 23.7694, 25.6801, 22.9365, 26.6749, 26.6338, 24.3009, 25.5076, 26.2825, 23.9235, 25.9379, 26.9582, 24.2888, 24.6939, 28.6157, 26.6019 )
xData <- c( 2.9859, -6.3616, -0.1457, 4.9285, 3.2626, 3.6556, 4.519, 9.9376, 11.754, 2.3091, 4.4596, -3.359, 3.1244, 4.3235, 3.3884, -1.369, -5.1293, 0.5116, 6.1125, 15.3293, 9.6873, 9.862, 15.9674, 16.3417, 20.5944, 20.2246, 22.4165, 23.8751, 19.2596, 12.6268, 3.4223, 7.8371, 13.6312, 17.4746, 15.231, 17.7947, 12.092, 10.4566, 7.8127, 14.7825, 11.1885, 23.8849, 30.7432, 33.85, 33.4494, 27.2179, 23.1117, 27.1605, 20.3911, 21.1012, 19.1438, 20.0941, 16.1906, 13.7102, 14.6144, 14.9335, 29.1133, 31.3782, 32.7828, 30.4111, 28.2442, 29.0585, 35.9782, 34.9491, 38.223, 31.3179, 29.1704, 22.3349, 16.5423, 23.9608, 20.8017, 19.3039, 19.1387, 13.0404, 9.8801, 3.2505, -4.1992, -7.9626, -4.5083, -6.2854, -2.453, -4.7119, 1.6309, 1.1959, 5.2831, 5.15, 3.72, 0.6658, 2.7384, 8.747, 8.2221, 18.663, 11.3843, 10.3179, 21.0908, 25.0415, 24.7982, 34.6863, 26.3264, 23.3543, 23.7712, 22.7445, 29.2034, 30.2059, 36.2288, 37.6518, 36.3735, 39.842, 27.8231, 26.5969, 26.9149, 24.3732, 28.5127, 26.7399, 30.4023, 39.5915, 44.8034, 44.099, 40.2248, 42.9846, 40.8308, 42.4046, 41.4261, 40.459, 27.9815, 40.4637, 44.3681, 47.9082, 49.0735, 48.4331, 49.8923, 61.6028, 63.6814, 72.3463, 71.1518, 74.7257, 79.1934, 83.1976, 74.4918, 72.1001, 66.1204, 63.7527, 63.148, 67.4173, 74.2575, 68.8726, 68.1953, 70.0591, 71.8744, 73.2482, 79.2107, 78.5204, 87.2619, 87.7628, 91.3676, 93.3275, 97.5043, 103.3569, 94.6093, 91.3573, 85.871, 86.2847, 86.2251, 84.2668, 86.9466, 92.0229, 82.0012, 88.6786, 85.3663, 88.9641, 96.0459, 96.2658, 90.9596, 88.4945, 95.4932, 92.919, 88.7586, 91.0783, 92.4792, 93.5653, 94.3455, 87.9873, 88.7311, 102.6294, 96.466, 92.2194, 91.9247, 84.9855, 90.2585, 82.241, 89.7112, 86.6858, 85.9218, 95.0793, 95.0479, 101.2393, 99.3097, 94.1683, 96.0313, 91.7769, 91.129, 95.5681, 101.2689, 100.3594, 103.8543, 97.5836, 98.9271, 103.799, 105.883, 102.1103, 105.8276, 107.9296, 101.8401, 107.2261, 106.4817, 111.6719, 116.1099, 115.1661, 115.6657, 115.8189, 120.278, 118.6835, 109.1592, 109.7436, 
117.1348, 114.0379, 116.9896, 113.5988, 111.9652, 114.1912, 108.2102, 105.3345, 108.2169, 112.0761, 102.6672, 112.187, 113.2779, 112.4105, 103.1019, 98.7301, 103.9845, 97.909, 104.8979, 108.135, 103.5588, 102.4043, 102.0028, 100.3617, 97.9829, 89.8509 )
# Convert the simulated vectors to annual ts objects so that plotting and
# differencing dispatch to the time-series methods.
y <- ts(data=yData, frequency=1, start=c(1, 1)) # trend stationary
x <- ts(data=xData, frequency=1, start=c(1, 1)) # random walk
# Plot both series stacked in one panel for visual comparison
plot(cbind(y, x))
# Plot detrended y (trend stationary): first difference removes the trend
plot(diff(y))
# Plot detrended x (random walk): differencing a random walk leaves white noise
plot(diff(x))
# Load two astsa example datasets: global temperature deviations and
# cardiovascular mortality.
data(globtemp, package="astsa")
data(cmort, package="astsa")
# Plot globtemp and detrended (first-differenced) globtemp in a 2x1 layout
par(mfrow = c(2,1))
plot(globtemp)
plot(diff(globtemp))
# Plot cmort and detrended cmort, same 2x1 layout
par(mfrow = c(2,1))
plot(cmort)
plot(diff(cmort))
# Restore the default single-panel layout
par(mfrow=c(1, 1))
data(gnp, package="astsa")
# Fix: djia was used below without ever being loaded; load it like the other
# astsa datasets so this section runs standalone.
data(djia, package="astsa")
# Plot GNP series (gnp) and its growth rate (diff of log approximates
# the period-over-period growth rate)
par(mfrow = c(2,1))
plot(gnp)
plot(diff(log(gnp)))
# Plot DJIA closings (djia$Close) and its returns (log differences)
par(mfrow = c(2,1))
plot(djia[,"Close"])
plot(diff(log(djia[,"Close"])))
# Restore the default single-panel layout
par(mfrow=c(1, 1))
# Simulate and display three basic ARMA-family processes, 200 points each.

# White noise: ARMA(0,0,0), no autoregressive or moving-average structure
WN <- arima.sim(n=200, model=list(order=c(0, 0, 0)))
plot(WN)

# First-order moving average, MA(1), with coefficient 0.9
MA <- arima.sim(n=200, model=list(order=c(0, 0, 1), ma=0.9))
plot(MA)

# Second-order autoregression, AR(2), with coefficients 1.5 and -.75
AR <- arima.sim(n=200, model=list(order=c(2, 0, 0), ar=c(1.5, -.75)))
plot(AR)
Chapter 2 - Fitting ARMA Models
AR and MA Models have many visual similarities - cannot necessarily distinguish visually:
AR and MA components together make an ARMA model - typical for time series, since real series frequently exhibit both autoregressive and moving-average correlation structure:
Model Choice and Residual Analysis - frequently a good idea to fit several models and then select the best:
Example code includes:
# Generate 100 observations from the AR(1) model x_t = 0.9 x_{t-1} + w_t
x <- arima.sim(model = list(order = c(1, 0, 0), ar = .9), n = 100)
# Plot the generated data
plot(x)
# Plot the sample ACF/PACF pair; for an AR(1) the ACF tails off while the
# PACF cuts off after lag 1
astsa::acf2(x)
## ACF PACF
## [1,] 0.90 0.90
## [2,] 0.81 0.00
## [3,] 0.74 0.05
## [4,] 0.67 -0.03
## [5,] 0.63 0.12
## [6,] 0.56 -0.15
## [7,] 0.49 -0.03
## [8,] 0.43 -0.03
## [9,] 0.38 0.01
## [10,] 0.33 0.01
## [11,] 0.31 0.08
## [12,] 0.27 -0.06
## [13,] 0.26 0.13
## [14,] 0.23 -0.11
## [15,] 0.20 -0.02
## [16,] 0.19 0.04
## [17,] 0.18 0.05
## [18,] 0.17 0.00
## [19,] 0.17 0.01
## [20,] 0.14 -0.08
# Fit an AR(1) to the data and examine the t-table; the ar1 estimate should
# be near the true value 0.9, and the mean term insignificant
astsa::sarima(x, p=1, d=0, q=0)
## initial value 0.954248
## iter 2 value 0.096955
## iter 3 value 0.096719
## iter 4 value 0.096672
## iter 5 value 0.096292
## iter 6 value 0.096291
## iter 7 value 0.096291
## iter 7 value 0.096291
## iter 7 value 0.096291
## final value 0.096291
## converged
## initial value 0.106924
## iter 2 value 0.106764
## iter 3 value 0.105928
## iter 4 value 0.105850
## iter 5 value 0.105793
## iter 6 value 0.105759
## iter 7 value 0.105755
## iter 8 value 0.105746
## iter 9 value 0.105746
## iter 10 value 0.105744
## iter 11 value 0.105744
## iter 11 value 0.105744
## final value 0.105744
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = xmean, include.mean = FALSE, optim.control = list(trace = trc,
## REPORT = 1, reltol = tol))
##
## Coefficients:
## ar1 xmean
## 0.9022 0.7655
## s.e. 0.0408 1.0371
##
## sigma^2 estimated as 1.215: log likelihood = -152.47, aic = 310.94
##
## $degrees_of_freedom
## [1] 98
##
## $ttable
## Estimate SE t.value p.value
## ar1 0.9022 0.0408 22.1400 0.0000
## xmean 0.7655 1.0371 0.7382 0.4622
##
## $AIC
## [1] 1.23467
##
## $AICc
## [1] 1.25717
##
## $BIC
## [1] 0.2867731
# Generate 200 observations from an AR(2) with coefficients 1.5 and -0.75
x <- arima.sim(model = list(order = c(2, 0, 0), ar = c(1.5, -.75)), n = 200)
# Plot x
plot(x)
# Plot the sample ACF/PACF of x; the PACF should cut off after lag 2
astsa::acf2(x)
## ACF PACF
## [1,] 0.86 0.86
## [2,] 0.55 -0.70
## [3,] 0.19 -0.12
## [4,] -0.14 -0.03
## [5,] -0.35 0.09
## [6,] -0.42 0.03
## [7,] -0.35 0.10
## [8,] -0.17 0.07
## [9,] 0.04 0.04
## [10,] 0.24 0.07
## [11,] 0.36 0.00
## [12,] 0.38 -0.06
## [13,] 0.31 0.07
## [14,] 0.16 -0.11
## [15,] -0.02 -0.05
## [16,] -0.18 0.02
## [17,] -0.28 0.01
## [18,] -0.32 -0.18
## [19,] -0.29 -0.03
## [20,] -0.22 -0.05
## [21,] -0.12 0.04
## [22,] -0.02 -0.01
## [23,] 0.07 -0.03
## [24,] 0.13 0.07
## [25,] 0.15 0.02
# Fit an AR(2) to the data and examine the t-table; both AR estimates should
# be significant and close to the true values (1.5, -0.75)
astsa::sarima(x, p=2, d=0, q=0)
## initial value 1.052921
## iter 2 value 0.923863
## iter 3 value 0.502476
## iter 4 value 0.271409
## iter 5 value 0.076147
## iter 6 value 0.045722
## iter 7 value 0.028238
## iter 8 value 0.027774
## iter 9 value 0.027763
## iter 10 value 0.027731
## iter 11 value 0.027687
## iter 12 value 0.027686
## iter 13 value 0.027684
## iter 14 value 0.027684
## iter 14 value 0.027684
## iter 14 value 0.027684
## final value 0.027684
## converged
## initial value 0.030783
## iter 2 value 0.030756
## iter 3 value 0.030747
## iter 4 value 0.030747
## iter 5 value 0.030747
## iter 6 value 0.030747
## iter 7 value 0.030747
## iter 7 value 0.030747
## iter 7 value 0.030747
## final value 0.030747
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = xmean, include.mean = FALSE, optim.control = list(trace = trc,
## REPORT = 1, reltol = tol))
##
## Coefficients:
## ar1 ar2 xmean
## 1.4674 -0.7081 -0.2833
## s.e. 0.0493 0.0491 0.3006
##
## sigma^2 estimated as 1.049: log likelihood = -289.94, aic = 587.87
##
## $degrees_of_freedom
## [1] 197
##
## $ttable
## Estimate SE t.value p.value
## ar1 1.4674 0.0493 29.7808 0.0000
## ar2 -0.7081 0.0491 -14.4164 0.0000
## xmean -0.2833 0.3006 -0.9425 0.3471
##
## $AIC
## [1] 1.077838
##
## $AICc
## [1] 1.088863
##
## $BIC
## [1] 0.1273124
# Generate 100 observations from an MA(1) with coefficient -0.8
x <- arima.sim(model = list(order = c(0, 0, 1), ma = -.8), n = 100)
# Plot x
plot(x)
# Plot the sample ACF/PACF of x; for an MA(1) the ACF cuts off after lag 1
astsa::acf2(x)
## ACF PACF
## [1,] -0.56 -0.56
## [2,] 0.10 -0.30
## [3,] -0.07 -0.27
## [4,] 0.03 -0.22
## [5,] 0.00 -0.17
## [6,] 0.12 0.07
## [7,] -0.14 0.01
## [8,] 0.10 0.10
## [9,] -0.17 -0.11
## [10,] 0.01 -0.34
## [11,] 0.25 0.02
## [12,] -0.18 0.01
## [13,] -0.01 -0.08
## [14,] 0.05 0.03
## [15,] -0.07 0.01
## [16,] 0.07 -0.01
## [17,] -0.03 -0.09
## [18,] 0.03 -0.01
## [19,] -0.04 -0.06
## [20,] -0.03 -0.03
# Fit an MA(1) to the data and examine the t-table; the ma1 estimate should
# be near the true value -0.8
astsa::sarima(x, p=0, d=0, q=1)
## initial value 0.307876
## iter 2 value 0.064846
## iter 3 value 0.022236
## iter 4 value 0.018361
## iter 5 value 0.005184
## iter 6 value 0.002722
## iter 7 value 0.002318
## iter 8 value 0.002049
## iter 9 value 0.002001
## iter 10 value 0.002001
## iter 11 value 0.002001
## iter 11 value 0.002001
## final value 0.002001
## converged
## initial value 0.007997
## iter 2 value 0.007973
## iter 3 value 0.007917
## iter 4 value 0.007907
## iter 5 value 0.007907
## iter 5 value 0.007907
## iter 5 value 0.007907
## final value 0.007907
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = xmean, include.mean = FALSE, optim.control = list(trace = trc,
## REPORT = 1, reltol = tol))
##
## Coefficients:
## ma1 xmean
## -0.8526 -0.0272
## s.e. 0.0586 0.0158
##
## sigma^2 estimated as 1.003: log likelihood = -142.68, aic = 291.37
##
## $degrees_of_freedom
## [1] 98
##
## $ttable
## Estimate SE t.value p.value
## ma1 -0.8526 0.0586 -14.5486 0.0000
## xmean -0.0272 0.0158 -1.7159 0.0893
##
## $AIC
## [1] 1.042832
##
## $AICc
## [1] 1.065332
##
## $BIC
## [1] 0.09493556
# Generate 250 observations from an ARMA(2,1): ar = (1, -0.9), ma = 0.8
x <- arima.sim(model = list(order = c(2, 0, 1), ar = c(1, -.9), ma = .8), n = 250)
# Plot x
plot(x)
# Plot the sample ACF/PACF of x; for an ARMA both ACF and PACF tail off
astsa::acf2(x)
## ACF PACF
## [1,] 0.55 0.55
## [2,] -0.34 -0.92
## [3,] -0.82 0.40
## [4,] -0.50 -0.15
## [5,] 0.23 0.03
## [6,] 0.66 -0.13
## [7,] 0.42 0.04
## [8,] -0.18 -0.05
## [9,] -0.56 -0.07
## [10,] -0.39 -0.01
## [11,] 0.12 -0.03
## [12,] 0.45 -0.08
## [13,] 0.32 -0.06
## [14,] -0.10 0.00
## [15,] -0.39 -0.06
## [16,] -0.29 -0.03
## [17,] 0.07 -0.04
## [18,] 0.31 -0.10
## [19,] 0.22 -0.04
## [20,] -0.07 0.02
## [21,] -0.26 0.04
## [22,] -0.15 0.04
## [23,] 0.12 -0.05
## [24,] 0.24 -0.07
## [25,] 0.09 0.03
## [26,] -0.14 0.11
# Fit an ARMA(2,1) to the data and examine the t-table; all three
# coefficient estimates should be significant and near the true values
astsa::sarima(x, p=2, d=0, q=1)
## initial value 1.386790
## iter 2 value 0.582732
## iter 3 value 0.355625
## iter 4 value 0.116850
## iter 5 value 0.081182
## iter 6 value 0.039685
## iter 7 value 0.017575
## iter 8 value 0.012577
## iter 9 value 0.004440
## iter 10 value 0.001066
## iter 11 value -0.000395
## iter 12 value -0.000572
## iter 13 value -0.000603
## iter 14 value -0.000615
## iter 15 value -0.000633
## iter 16 value -0.000640
## iter 17 value -0.000640
## iter 18 value -0.000640
## iter 19 value -0.000640
## iter 20 value -0.000640
## iter 20 value -0.000640
## final value -0.000640
## converged
## initial value 0.011587
## iter 2 value 0.011553
## iter 3 value 0.011504
## iter 4 value 0.011479
## iter 5 value 0.011476
## iter 6 value 0.011472
## iter 7 value 0.011472
## iter 8 value 0.011472
## iter 8 value 0.011472
## iter 8 value 0.011472
## final value 0.011472
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = xmean, include.mean = FALSE, optim.control = list(trace = trc,
## REPORT = 1, reltol = tol))
##
## Coefficients:
## ar1 ar2 ma1 xmean
## 0.9860 -0.8837 0.8254 -0.1482
## s.e. 0.0297 0.0293 0.0443 0.1287
##
## sigma^2 estimated as 0.9979: log likelihood = -357.6, aic = 725.21
##
## $degrees_of_freedom
## [1] 246
##
## $ttable
## Estimate SE t.value p.value
## ar1 0.9860 0.0297 33.1548 0.0000
## ar2 -0.8837 0.0293 -30.1650 0.0000
## ma1 0.8254 0.0443 18.6169 0.0000
## xmean -0.1482 0.1287 -1.1511 0.2508
##
## $AIC
## [1] 1.029889
##
## $AICc
## [1] 1.038872
##
## $BIC
## [1] 0.08623193
# Load the glacial varve series; log transform stabilizes the variance and
# differencing removes the trend.
# (Style fix: removed the stray space between the function name and "(",
# for consistency with every other data() call in this file.)
data(varve, package="astsa")
dl_varve <- diff(log(varve))
# Fit an MA(1) to dl_varve.
astsa::sarima(dl_varve, p=0, d=0, q=1)
## initial value -0.551780
## iter 2 value -0.671633
## iter 3 value -0.706234
## iter 4 value -0.707586
## iter 5 value -0.718543
## iter 6 value -0.719692
## iter 7 value -0.721967
## iter 8 value -0.722970
## iter 9 value -0.723231
## iter 10 value -0.723247
## iter 11 value -0.723248
## iter 12 value -0.723248
## iter 12 value -0.723248
## iter 12 value -0.723248
## final value -0.723248
## converged
## initial value -0.722762
## iter 2 value -0.722764
## iter 3 value -0.722764
## iter 4 value -0.722765
## iter 4 value -0.722765
## iter 4 value -0.722765
## final value -0.722765
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = xmean, include.mean = FALSE, optim.control = list(trace = trc,
## REPORT = 1, reltol = tol))
##
## Coefficients:
## ma1 xmean
## -0.7710 -0.0013
## s.e. 0.0341 0.0044
##
## sigma^2 estimated as 0.2353: log likelihood = -440.68, aic = 887.36
##
## $degrees_of_freedom
## [1] 631
##
## $ttable
## Estimate SE t.value p.value
## ma1 -0.7710 0.0341 -22.6002 0.0000
## xmean -0.0013 0.0044 -0.2818 0.7782
##
## $AIC
## [1] -0.4406366
##
## $AICc
## [1] -0.4374168
##
## $BIC
## [1] -1.426575
# Fit an MA(2) to dl_varve. Improvement? Compare AIC/AICc/BIC with the MA(1)
astsa::sarima(dl_varve, p=0, d=0, q=2)
## initial value -0.551780
## iter 2 value -0.679736
## iter 3 value -0.728605
## iter 4 value -0.734640
## iter 5 value -0.735449
## iter 6 value -0.735979
## iter 7 value -0.736015
## iter 8 value -0.736059
## iter 9 value -0.736060
## iter 10 value -0.736060
## iter 11 value -0.736061
## iter 12 value -0.736061
## iter 12 value -0.736061
## iter 12 value -0.736061
## final value -0.736061
## converged
## initial value -0.735372
## iter 2 value -0.735378
## iter 3 value -0.735379
## iter 4 value -0.735379
## iter 4 value -0.735379
## iter 4 value -0.735379
## final value -0.735379
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = xmean, include.mean = FALSE, optim.control = list(trace = trc,
## REPORT = 1, reltol = tol))
##
## Coefficients:
## ma1 ma2 xmean
## -0.6710 -0.1595 -0.0013
## s.e. 0.0375 0.0392 0.0033
##
## sigma^2 estimated as 0.2294: log likelihood = -432.69, aic = 873.39
##
## $degrees_of_freedom
## [1] 630
##
## $ttable
## Estimate SE t.value p.value
## ma1 -0.6710 0.0375 -17.9057 0.0000
## ma2 -0.1595 0.0392 -4.0667 0.0001
## xmean -0.0013 0.0033 -0.4007 0.6888
##
## $AIC
## [1] -0.4629629
##
## $AICc
## [1] -0.4597027
##
## $BIC
## [1] -1.441871
# Fit an ARMA(1,1) to dl_varve. Improvement? This has the lowest AIC/BIC of
# the three candidate models
astsa::sarima(dl_varve, p=1, d=0, q=1)
## initial value -0.550994
## iter 2 value -0.648962
## iter 3 value -0.676965
## iter 4 value -0.699167
## iter 5 value -0.724554
## iter 6 value -0.726719
## iter 7 value -0.729066
## iter 8 value -0.731976
## iter 9 value -0.734235
## iter 10 value -0.735969
## iter 11 value -0.736410
## iter 12 value -0.737045
## iter 13 value -0.737600
## iter 14 value -0.737641
## iter 15 value -0.737643
## iter 16 value -0.737643
## iter 17 value -0.737643
## iter 18 value -0.737643
## iter 18 value -0.737643
## iter 18 value -0.737643
## final value -0.737643
## converged
## initial value -0.737522
## iter 2 value -0.737527
## iter 3 value -0.737528
## iter 4 value -0.737529
## iter 5 value -0.737530
## iter 5 value -0.737530
## iter 5 value -0.737530
## final value -0.737530
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = xmean, include.mean = FALSE, optim.control = list(trace = trc,
## REPORT = 1, reltol = tol))
##
## Coefficients:
## ar1 ma1 xmean
## 0.2341 -0.8871 -0.0013
## s.e. 0.0518 0.0292 0.0028
##
## sigma^2 estimated as 0.2284: log likelihood = -431.33, aic = 870.66
##
## $degrees_of_freedom
## [1] 630
##
## $ttable
## Estimate SE t.value p.value
## ar1 0.2341 0.0518 4.5184 0.0000
## ma1 -0.8871 0.0292 -30.4107 0.0000
## xmean -0.0013 0.0028 -0.4618 0.6444
##
## $AIC
## [1] -0.467376
##
## $AICc
## [1] -0.4641159
##
## $BIC
## [1] -1.446284
# Fit an MA(1) to dl_varve and examine the residual diagnostics that
# sarima() plots
astsa::sarima(dl_varve, p=0, d=0, q=1)
## initial value -0.551780
## iter 2 value -0.671633
## iter 3 value -0.706234
## iter 4 value -0.707586
## iter 5 value -0.718543
## iter 6 value -0.719692
## iter 7 value -0.721967
## iter 8 value -0.722970
## iter 9 value -0.723231
## iter 10 value -0.723247
## iter 11 value -0.723248
## iter 12 value -0.723248
## iter 12 value -0.723248
## iter 12 value -0.723248
## final value -0.723248
## converged
## initial value -0.722762
## iter 2 value -0.722764
## iter 3 value -0.722764
## iter 4 value -0.722765
## iter 4 value -0.722765
## iter 4 value -0.722765
## final value -0.722765
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = xmean, include.mean = FALSE, optim.control = list(trace = trc,
## REPORT = 1, reltol = tol))
##
## Coefficients:
## ma1 xmean
## -0.7710 -0.0013
## s.e. 0.0341 0.0044
##
## sigma^2 estimated as 0.2353: log likelihood = -440.68, aic = 887.36
##
## $degrees_of_freedom
## [1] 631
##
## $ttable
## Estimate SE t.value p.value
## ma1 -0.7710 0.0341 -22.6002 0.0000
## xmean -0.0013 0.0044 -0.2818 0.7782
##
## $AIC
## [1] -0.4406366
##
## $AICc
## [1] -0.4374168
##
## $BIC
## [1] -1.426575
# Fit an ARMA(1,1) to dl_varve and examine the residual diagnostics for
# comparison with the MA(1) fit above
astsa::sarima(dl_varve, p=1, d=0, q=1)
## initial value -0.550994
## iter 2 value -0.648962
## iter 3 value -0.676965
## iter 4 value -0.699167
## iter 5 value -0.724554
## iter 6 value -0.726719
## iter 7 value -0.729066
## iter 8 value -0.731976
## iter 9 value -0.734235
## iter 10 value -0.735969
## iter 11 value -0.736410
## iter 12 value -0.737045
## iter 13 value -0.737600
## iter 14 value -0.737641
## iter 15 value -0.737643
## iter 16 value -0.737643
## iter 17 value -0.737643
## iter 18 value -0.737643
## iter 18 value -0.737643
## iter 18 value -0.737643
## final value -0.737643
## converged
## initial value -0.737522
## iter 2 value -0.737527
## iter 3 value -0.737528
## iter 4 value -0.737529
## iter 5 value -0.737530
## iter 5 value -0.737530
## iter 5 value -0.737530
## final value -0.737530
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = xmean, include.mean = FALSE, optim.control = list(trace = trc,
## REPORT = 1, reltol = tol))
##
## Coefficients:
## ar1 ma1 xmean
## 0.2341 -0.8871 -0.0013
## s.e. 0.0518 0.0292 0.0028
##
## sigma^2 estimated as 0.2284: log likelihood = -431.33, aic = 870.66
##
## $degrees_of_freedom
## [1] 630
##
## $ttable
## Estimate SE t.value p.value
## ar1 0.2341 0.0518 4.5184 0.0000
## ma1 -0.8871 0.0292 -30.4107 0.0000
## xmean -0.0013 0.0028 -0.4618 0.6444
##
## $AIC
## [1] -0.467376
##
## $AICc
## [1] -0.4641159
##
## $BIC
## [1] -1.446284
# Load the weekly crude oil price series from astsa
data(oil, package="astsa")
# Calculate approximate oil returns (log differences)
oil_returns <- diff(log(oil))
# Plot oil_returns. Notice the outliers.
plot(oil_returns)
# Plot the sample ACF/PACF pair for oil_returns
astsa::acf2(oil_returns)
## ACF PACF
## [1,] 0.13 0.13
## [2,] -0.07 -0.09
## [3,] 0.13 0.16
## [4,] -0.01 -0.06
## [5,] 0.02 0.05
## [6,] -0.03 -0.08
## [7,] -0.03 0.00
## [8,] 0.13 0.12
## [9,] 0.08 0.05
## [10,] 0.02 0.03
## [11,] 0.01 -0.02
## [12,] 0.00 0.00
## [13,] -0.02 -0.03
## [14,] 0.06 0.09
## [15,] -0.05 -0.07
## [16,] -0.09 -0.06
## [17,] 0.03 0.01
## [18,] 0.05 0.04
## [19,] -0.05 -0.05
## [20,] -0.07 -0.05
## [21,] 0.04 0.05
## [22,] 0.09 0.06
## [23,] -0.05 -0.06
## [24,] -0.08 -0.05
## [25,] -0.07 -0.08
## [26,] 0.00 0.02
## [27,] -0.11 -0.11
## [28,] -0.07 0.01
## [29,] 0.02 0.00
## [30,] -0.02 -0.01
## [31,] -0.03 -0.05
## [32,] -0.05 -0.04
## [33,] -0.03 0.02
## [34,] 0.00 0.02
# Assuming both ACF and PACF are tailing off, fit an ARMA(1,1) to oil_returns
astsa::sarima(oil_returns, p=1, d=0, q=1)
## initial value -3.057594
## iter 2 value -3.061420
## iter 3 value -3.067360
## iter 4 value -3.067479
## iter 5 value -3.071834
## iter 6 value -3.074359
## iter 7 value -3.074843
## iter 8 value -3.076656
## iter 9 value -3.080467
## iter 10 value -3.081546
## iter 11 value -3.081603
## iter 12 value -3.081615
## iter 13 value -3.081642
## iter 14 value -3.081643
## iter 14 value -3.081643
## iter 14 value -3.081643
## final value -3.081643
## converged
## initial value -3.082345
## iter 2 value -3.082345
## iter 3 value -3.082346
## iter 4 value -3.082346
## iter 5 value -3.082346
## iter 5 value -3.082346
## iter 5 value -3.082346
## final value -3.082346
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = xmean, include.mean = FALSE, optim.control = list(trace = trc,
## REPORT = 1, reltol = tol))
##
## Coefficients:
## ar1 ma1 xmean
## -0.5264 0.7146 0.0018
## s.e. 0.0871 0.0683 0.0022
##
## sigma^2 estimated as 0.002102: log likelihood = 904.89, aic = -1801.79
##
## $degrees_of_freedom
## [1] 541
##
## $ttable
## Estimate SE t.value p.value
## ar1 -0.5264 0.0871 -6.0422 0.0000
## ma1 0.7146 0.0683 10.4699 0.0000
## xmean 0.0018 0.0022 0.7981 0.4252
##
## $AIC
## [1] -5.153838
##
## $AICc
## [1] -5.150025
##
## $BIC
## [1] -6.130131
Chapter 3 - ARIMA Models
ARIMA - Integrated ARMA fitted to non-stationary time series:
ARIMA Diagnostics - typical concerns about overfitting:
Forecasting ARIMA - the fitted model describes the dynamics, which can be projected into the future to produce forecasts:
Example code includes:
# Generate 200 observations from an ARIMA(1,1,0) with ar = 0.9
x <- arima.sim(model = list(order = c(1, 1, 0), ar = .9), n = 200)
# Plot x; the integrated series is non-stationary
plot(x)
# Plot the ACF/PACF pair of x; the very slowly decaying ACF signals that
# differencing is needed
astsa::acf2(x)
## ACF PACF
## [1,] 1.00 1.00
## [2,] 0.99 -0.15
## [3,] 0.98 -0.11
## [4,] 0.98 -0.10
## [5,] 0.97 -0.06
## [6,] 0.96 -0.06
## [7,] 0.95 -0.04
## [8,] 0.94 -0.02
## [9,] 0.93 -0.02
## [10,] 0.92 -0.02
## [11,] 0.91 -0.01
## [12,] 0.90 0.00
## [13,] 0.89 -0.01
## [14,] 0.87 -0.03
## [15,] 0.86 -0.03
## [16,] 0.85 -0.04
## [17,] 0.84 -0.04
## [18,] 0.82 -0.03
## [19,] 0.81 -0.02
## [20,] 0.80 -0.02
## [21,] 0.78 -0.02
## [22,] 0.77 -0.01
## [23,] 0.75 -0.04
## [24,] 0.74 -0.03
## [25,] 0.72 -0.02
# Plot the differenced data, which should now look stationary
plot(diff(x))
# Plot the ACF/PACF pair of the differenced data; an AR(1) pattern remains
astsa::acf2(diff(x))
## ACF PACF
## [1,] 0.88 0.88
## [2,] 0.76 -0.04
## [3,] 0.64 -0.09
## [4,] 0.54 0.02
## [5,] 0.44 -0.04
## [6,] 0.36 -0.02
## [7,] 0.29 -0.01
## [8,] 0.24 0.05
## [9,] 0.19 -0.06
## [10,] 0.18 0.13
## [11,] 0.16 -0.05
## [12,] 0.14 0.00
## [13,] 0.15 0.13
## [14,] 0.17 -0.01
## [15,] 0.16 -0.04
## [16,] 0.17 0.03
## [17,] 0.18 0.10
## [18,] 0.20 -0.03
## [19,] 0.20 0.01
## [20,] 0.18 -0.04
## [21,] 0.18 0.02
## [22,] 0.18 0.07
## [23,] 0.18 0.02
## [24,] 0.18 0.02
## [25,] 0.20 0.10
xData <- c( 2.071, 4.75, 6.674, 5.908, 3.886, 1.797, 0.649, 0.944, 1.755, 0.949, -0.321, -2.235, -4.472, -5.33, -3.556, 0.183, 6.393, 13.8, 20.431, 23.98, 24.522, 23.907, 23.27, 22.19, 20.059, 18.234, 17.08, 18.352, 21.234, 22.34, 21.248, 20.583, 19.799, 18.604, 19.393, 20.45, 21.861, 24.772, 29.022, 33.568, 38.256, 41.102, 42.96, 44.971, 47.002, 47.558, 47.397, 47.664, 47.592, 46.829, 46.66, 47.851, 51.184, 55.756, 60.053, 65.424, 71.336, 75.162, 77.131, 77.535, 76.534, 75.268, 74.917, 74.917, 74.447, 73.814, 71.874, 70.049, 68.571, 69.212, 72.331, 77.285, 82.489, 88.604, 94.093, 97.054, 99.208, 99.862, 100.939, 101.231, 101.496, 102.408, 103.906, 107.007, 111.464, 115.662, 119.608, 123.482, 125.956, 126.39, 126.386, 125.913, 125.488, 125.576, 126.291, 127.143, 127.52, 126.081, 124.965, 123.745, 122.581, 121.929, 123.325, 126.775, 132.555, 139.235, 144.934, 149.721, 154.382, 157.019, 157.206, 154.616, 148.832, 141.499, 135.467, 131.852, 132.204, 136.506, 142.587, 148.555, 150.681, 148.482, 142.889, 136.895, 131.35, 128.87, 127.53, 128.324, 131.564, 136.374, 142.986, 150.038, 155.446, 159.031, 159.776, 157.518, 155.821, 156.742, 159.896, 162.664, 164.717, 166.054, 164.365, 160.334, 153.985, 148.808, 146.378, 145.179, 145.683, 148.118, 152.318, 158.13, 164.868, 171.405, 177.053, 182.439, 186.528, 189.036, 191.453, 193.507, 196.097, 198.629, 200.216, 200.839, 201.791, 201.882, 201.844, 201.766, 204.88, 208.738, 212.117, 214.878, 218.935, 223.003, 227.042, 228.179, 227.576, 227.183, 227.895, 229.689, 232.106, 234.707, 234.405, 232.747, 232.052, 234.176, 237.706, 243.079, 247.933, 249.965, 251.077, 250.945, 250.302, 248.648, 248.404, 250.725, 255.209, 260.453, 264.559, 268.147, 269.122, 267.308, 262.819, 258.705, 255.487, 253.049, 251.807, 251.932, 253.196, 256.489, 259.875, 263.342, 266.208, 266.414, 265.439, 264.196, 264.413, 266.275, 270.239, 276.725, 283.784, 289.445, 292.879, 293.287, 292.272, 290.836, 288.097, 285.868, 283.051, 281.694, 281.11, 281.1, 282.375, 
284.273, 286.304, 290.172, 296.595, 303.989, 310.565, 315.547, 317.702, 317.364, 313.184, 306.788, 300.193, 295.649, 293.628, 296.013, 301.313, 306.754 )
# Build an annual ts object from the simulated data
x <- ts(data=xData, frequency=1, start=c(1, 1))
str(x)
## Time-Series [1:250] from 1 to 250: 2.07 4.75 6.67 5.91 3.89 ...
# Difference once to remove the trend
y <- diff(x)
# Plot sample P/ACF of differenced data and determine model
# (fix: use the precomputed y instead of recomputing diff(x))
astsa::acf2(y)
## ACF PACF
## [1,] 0.86 0.86
## [2,] 0.53 -0.75
## [3,] 0.15 -0.04
## [4,] -0.18 -0.05
## [5,] -0.41 -0.04
## [6,] -0.49 0.03
## [7,] -0.45 -0.09
## [8,] -0.32 0.01
## [9,] -0.17 -0.10
## [10,] -0.04 -0.08
## [11,] 0.05 0.02
## [12,] 0.08 -0.10
## [13,] 0.06 -0.09
## [14,] 0.00 -0.03
## [15,] -0.04 0.04
## [16,] -0.07 -0.07
## [17,] -0.06 0.03
## [18,] -0.01 0.07
## [19,] 0.06 0.01
## [20,] 0.14 0.06
## [21,] 0.20 -0.08
## [22,] 0.20 0.01
## [23,] 0.15 -0.02
## [24,] 0.07 0.06
## [25,] -0.02 0.04
## [26,] -0.07 0.03
# Estimate parameters with an ARIMA(2,1,0) fit and examine the output
astsa::sarima(x, p=2, d=1, q=0)
## initial value 1.127641
## iter 2 value 0.983533
## iter 3 value 0.570293
## iter 4 value 0.314868
## iter 5 value 0.100372
## iter 6 value 0.063137
## iter 7 value 0.007514
## iter 8 value 0.005891
## iter 9 value 0.005789
## iter 10 value 0.005620
## iter 11 value 0.005527
## iter 12 value 0.005526
## iter 13 value 0.005526
## iter 13 value 0.005526
## iter 13 value 0.005526
## final value 0.005526
## converged
## initial value 0.008531
## iter 2 value 0.008509
## iter 3 value 0.008495
## iter 4 value 0.008495
## iter 5 value 0.008495
## iter 6 value 0.008495
## iter 6 value 0.008495
## iter 6 value 0.008495
## final value 0.008495
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = constant, optim.control = list(trace = trc, REPORT = 1,
## reltol = tol))
##
## Coefficients:
## ar1 ar2 constant
## 1.5197 -0.7669 1.2335
## s.e. 0.0401 0.0401 0.2570
##
## sigma^2 estimated as 1.004: log likelihood = -355.43, aic = 718.86
##
## $degrees_of_freedom
## [1] 247
##
## $ttable
## Estimate SE t.value p.value
## ar1 1.5197 0.0401 37.9154 0
## ar2 -0.7669 0.0401 -19.1298 0
## constant 1.2335 0.2570 4.7992 0
##
## $AIC
## [1] 1.028458
##
## $AICc
## [1] 1.037112
##
## $BIC
## [1] 0.07071602
# Load the annual global temperature deviation series
data(globtemp, package="astsa")
# Plot the sample ACF/PACF pair of the differenced data
astsa::acf2(diff(globtemp))
## ACF PACF
## [1,] -0.24 -0.24
## [2,] -0.19 -0.26
## [3,] -0.08 -0.23
## [4,] 0.20 0.06
## [5,] -0.15 -0.16
## [6,] -0.03 -0.09
## [7,] 0.03 -0.05
## [8,] 0.14 0.07
## [9,] -0.16 -0.09
## [10,] 0.11 0.11
## [11,] -0.05 -0.03
## [12,] 0.00 -0.02
## [13,] -0.13 -0.10
## [14,] 0.14 0.02
## [15,] -0.01 0.00
## [16,] -0.08 -0.09
## [17,] 0.00 0.00
## [18,] 0.19 0.11
## [19,] -0.07 0.04
## [20,] 0.02 0.13
## [21,] -0.02 0.09
## [22,] 0.08 0.08
# Fit an ARIMA(1,1,1) model to globtemp and examine the t-table/diagnostics
astsa::sarima(globtemp, p=1, d=1, q=1)
## initial value -2.218917
## iter 2 value -2.253118
## iter 3 value -2.263750
## iter 4 value -2.272144
## iter 5 value -2.282786
## iter 6 value -2.296777
## iter 7 value -2.297062
## iter 8 value -2.297253
## iter 9 value -2.297389
## iter 10 value -2.297405
## iter 11 value -2.297413
## iter 12 value -2.297413
## iter 13 value -2.297414
## iter 13 value -2.297414
## iter 13 value -2.297414
## final value -2.297414
## converged
## initial value -2.305504
## iter 2 value -2.305800
## iter 3 value -2.305821
## iter 4 value -2.306655
## iter 5 value -2.306875
## iter 6 value -2.306950
## iter 7 value -2.306955
## iter 8 value -2.306955
## iter 8 value -2.306955
## final value -2.306955
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = constant, optim.control = list(trace = trc, REPORT = 1,
## reltol = tol))
##
## Coefficients:
## ar1 ma1 constant
## 0.3549 -0.7663 0.0072
## s.e. 0.1314 0.0874 0.0032
##
## sigma^2 estimated as 0.009885: log likelihood = 119.88, aic = -231.76
##
## $degrees_of_freedom
## [1] 133
##
## $ttable
## Estimate SE t.value p.value
## ar1 0.3549 0.1314 2.7008 0.0078
## ma1 -0.7663 0.0874 -8.7701 0.0000
## constant 0.0072 0.0032 2.2738 0.0246
##
## $AIC
## [1] -3.572642
##
## $AICc
## [1] -3.555691
##
## $BIC
## [1] -4.508392
# Fit an ARIMA(0,1,2) model to globtemp. Which model is better? Compare
# AIC/AICc/BIC with the ARIMA(1,1,1) fit above
astsa::sarima(globtemp, p=0, d=1, q=2)
## initial value -2.220513
## iter 2 value -2.294887
## iter 3 value -2.307682
## iter 4 value -2.309170
## iter 5 value -2.310360
## iter 6 value -2.311251
## iter 7 value -2.311636
## iter 8 value -2.311648
## iter 9 value -2.311649
## iter 9 value -2.311649
## iter 9 value -2.311649
## final value -2.311649
## converged
## initial value -2.310187
## iter 2 value -2.310197
## iter 3 value -2.310199
## iter 4 value -2.310201
## iter 5 value -2.310202
## iter 5 value -2.310202
## iter 5 value -2.310202
## final value -2.310202
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = constant, optim.control = list(trace = trc, REPORT = 1,
## reltol = tol))
##
## Coefficients:
## ma1 ma2 constant
## -0.3984 -0.2173 0.0072
## s.e. 0.0808 0.0768 0.0033
##
## sigma^2 estimated as 0.00982: log likelihood = 120.32, aic = -232.64
##
## $degrees_of_freedom
## [1] 133
##
## $ttable
## Estimate SE t.value p.value
## ma1 -0.3984 0.0808 -4.9313 0.0000
## ma2 -0.2173 0.0768 -2.8303 0.0054
## constant 0.0072 0.0033 2.1463 0.0337
##
## $AIC
## [1] -3.579224
##
## $AICc
## [1] -3.562273
##
## $BIC
## [1] -4.514974
xData <- c( -0.0751, 0.1473, 1.8112, 4.8931, 7.0292, 8.1352, 9.0227, 10.3904, 11.9989, 11.4527, 11.2707, 12.5312, 12.1963, 10.7977, 12.0651, 13.5885, 12.4802, 11.709, 10.9356, 12.3663, 14.3876, 14.2129, 13.5661, 12.9155, 13.4154, 14.9105, 16.2552, 16.7393, 17.1447, 18.0555, 19.7376, 22.5407, 24.7367, 24.8413, 24.2488, 24.2967, 24.2308, 23.8902, 23.7027, 23.119, 22.7335, 22.9657, 23.8808, 24.4345, 24.2466, 23.4257, 20.8514, 19.4998, 19.9398, 20.2972, 20.7262, 20.1964, 17.5082, 15.9907, 15.4264, 14.1124, 14.4446, 16.3402, 17.577, 19.4557, 21.6471, 22.1894, 21.0641, 20.0541, 21.0169, 22.3758, 21.9696, 20.0109, 19.2389, 19.2861, 20.4638, 21.5998, 18.9907, 15.9218, 16.751, 17.3235, 15.8171, 16.9022, 17.2296, 16.2838, 17.8028, 19.7293, 20.4888, 21.4197, 21.1516, 21.1138, 23.0237, 24.211, 23.1522, 22.3539, 23.3107, 23.1071, 21.6763, 21.7444, 23.002, 24.7646, 26.0639, 25.9787, 27.8355, 30.5886, 30.1021, 29.4103, 29.8847, 29.5996, 29.5772, 30.4156, 30.2665, 28.7099, 27.6781, 25.9568, 24.9156, 24.8254, 25.6952, 27.641, 28.8981, 29.2489, 30.9297, 32.5278, 31.5972, 32.3645, 33.2106, 34.1595, 34.4231, 33.8642, 34.7263, 35.2714, 36.6619, 38.5322, 38.7635, 39.1658, 40.7182, 40.891, 39.7363, 40.1594, 40.6549, 40.3654, 40.5468, 40.7007, 40.3408, 39.3942, 37.2571, 36.9096, 37.0338, 35.8572, 35.4378, 36.6571, 38.4328, 40.4212, 42.0617, 42.1701, 42.9875, 45.4235, 45.7948, 44.3909, 42.8091, 39.8039, 37.1785, 36.8238, 36.8816, 37.6287, 39.3721, 39.7785, 39.3112, 36.6673, 33.274, 31.3097, 30.9826, 30.462, 30.6871, 29.6729, 28.5721, 30.0226, 31.0649, 32.9386, 34.8814, 34.8945, 35.0234, 34.6894, 33.0402, 34.2274, 37.5808, 39.2334, 37.9677, 36.6451, 36.7756, 34.4778, 31.6004, 29.1428, 28.61, 29.9308, 28.5681, 27.3121, 28.0795, 29.2628, 30.9914, 32.9232, 34.3216, 35.4834, 37.6638, 39.102, 39.2936, 40.9448, 42.3607, 43.5172, 44.4513, 43.9077, 43.3648, 44.2566, 44.0296, 43.3438, 43.433, 46.2347, 47.8019, 46.502, 46.5795, 49.1136, 50.928, 51.5114, 50.0802, 48.6748, 50.2435, 51.8771, 52.6298, 
52.8352, 52.9461, 50.4009, 48.5522, 50.3446, 53.2334, 54.3444, 55.4121, 55.9148, 53.7499, 53.9132, 54.7285, 54.4254, 53.5442, 54.1458, 56.728, 58.4062, 58.9589, 58.3515, 58.9129, 58.3679, 56.145, 54.1373, 54.0196, 54.2961, 52.784, 51.715 )
# Build an annual ts object from the simulated data
x <- ts(data=xData, frequency=1, start=c(1, 1))
str(x)
## Time-Series [1:250] from 1 to 250: -0.0751 0.1473 1.8112 4.8931 7.0292 ...
# Plot sample ACF/PACF pair of the differenced data to identify the model
astsa::acf2(diff(x))
## ACF PACF
## [1,] 0.49 0.49
## [2,] -0.02 -0.34
## [3,] 0.02 0.29
## [4,] 0.03 -0.23
## [5,] -0.01 0.18
## [6,] -0.02 -0.17
## [7,] -0.09 0.01
## [8,] -0.07 -0.01
## [9,] -0.02 -0.03
## [10,] -0.10 -0.13
## [11,] -0.10 0.09
## [12,] 0.00 -0.08
## [13,] -0.03 0.00
## [14,] -0.10 -0.11
## [15,] -0.07 0.05
## [16,] -0.03 -0.06
## [17,] 0.01 0.07
## [18,] 0.02 -0.07
## [19,] -0.02 0.02
## [20,] 0.00 0.02
## [21,] 0.10 0.08
## [22,] 0.15 0.09
## [23,] 0.12 -0.02
## [24,] 0.01 -0.06
## [25,] -0.04 0.01
## [26,] 0.02 0.03
# Fit the first model, ARIMA(0,1,1); compare parameters, check diagnostics
astsa::sarima(x, p=0, d=1, q=1)
## initial value 0.282663
## iter 2 value 0.086381
## iter 3 value 0.013882
## iter 4 value -0.019189
## iter 5 value -0.020178
## iter 6 value -0.020411
## iter 7 value -0.020429
## iter 8 value -0.020430
## iter 9 value -0.020431
## iter 10 value -0.020431
## iter 11 value -0.020431
## iter 12 value -0.020431
## iter 12 value -0.020431
## iter 12 value -0.020431
## final value -0.020431
## converged
## initial value -0.016992
## iter 2 value -0.017046
## iter 3 value -0.017049
## iter 4 value -0.017050
## iter 5 value -0.017050
## iter 6 value -0.017050
## iter 7 value -0.017050
## iter 8 value -0.017050
## iter 8 value -0.017050
## iter 8 value -0.017050
## final value -0.017050
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = constant, optim.control = list(trace = trc, REPORT = 1,
## reltol = tol))
##
## Coefficients:
## ma1 constant
## 0.9065 0.2063
## s.e. 0.0323 0.1181
##
## sigma^2 estimated as 0.9598: log likelihood = -349.07, aic = 704.14
##
## $degrees_of_freedom
## [1] 248
##
## $ttable
## Estimate SE t.value p.value
## ma1 0.9065 0.0323 28.0497 0.0000
## constant 0.2063 0.1181 1.7459 0.0821
##
## $AIC
## [1] 0.9749726
##
## $AICc
## [1] 0.9833628
##
## $BIC
## [1] 0.003144257
# Fit the second model and compare fit.
# ARIMA(0,1,2): the extra ma2 estimate is not significant (p = 0.9521 in the
# t-table below) and AIC/AICc/BIC all worsen versus ARIMA(0,1,1), so the
# simpler model is preferred.
astsa::sarima(x, p=0, d=1, q=2)
## initial value 0.282663
## iter 2 value 0.082436
## iter 3 value 0.052466
## iter 4 value -0.014265
## iter 5 value -0.018249
## iter 6 value -0.019318
## iter 7 value -0.020294
## iter 8 value -0.020432
## iter 9 value -0.020432
## iter 10 value -0.020433
## iter 11 value -0.020433
## iter 12 value -0.020433
## iter 13 value -0.020433
## iter 13 value -0.020433
## iter 13 value -0.020433
## final value -0.020433
## converged
## initial value -0.016998
## iter 2 value -0.017045
## iter 3 value -0.017056
## iter 4 value -0.017057
## iter 5 value -0.017058
## iter 6 value -0.017058
## iter 7 value -0.017058
## iter 8 value -0.017058
## iter 8 value -0.017058
## final value -0.017058
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = constant, optim.control = list(trace = trc, REPORT = 1,
## reltol = tol))
##
## Coefficients:
## ma1 ma2 constant
## 0.9099 0.0041 0.2063
## s.e. 0.0651 0.0684 0.1186
##
## sigma^2 estimated as 0.9598: log likelihood = -349.07, aic = 706.14
##
## $degrees_of_freedom
## [1] 247
##
## $ttable
## Estimate SE t.value p.value
## ma1 0.9099 0.0651 13.9821 0.0000
## ma2 0.0041 0.0684 0.0602 0.9521
## constant 0.2063 0.1186 1.7391 0.0833
##
## $AIC
## [1] 0.9829715
##
## $AICc
## [1] 0.9916246
##
## $BIC
## [1] 0.02522905
# Fit ARIMA(0,1,2) to globtemp and check diagnostics.
# globtemp is presumably the astsa global temperature deviation series,
# loaded elsewhere - TODO confirm; both ma terms and the constant are
# significant in the t-table below.
astsa::sarima(globtemp, p=0, d=1, q=2)
## initial value -2.220513
## iter 2 value -2.294887
## iter 3 value -2.307682
## iter 4 value -2.309170
## iter 5 value -2.310360
## iter 6 value -2.311251
## iter 7 value -2.311636
## iter 8 value -2.311648
## iter 9 value -2.311649
## iter 9 value -2.311649
## iter 9 value -2.311649
## final value -2.311649
## converged
## initial value -2.310187
## iter 2 value -2.310197
## iter 3 value -2.310199
## iter 4 value -2.310201
## iter 5 value -2.310202
## iter 5 value -2.310202
## iter 5 value -2.310202
## final value -2.310202
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = constant, optim.control = list(trace = trc, REPORT = 1,
## reltol = tol))
##
## Coefficients:
## ma1 ma2 constant
## -0.3984 -0.2173 0.0072
## s.e. 0.0808 0.0768 0.0033
##
## sigma^2 estimated as 0.00982: log likelihood = 120.32, aic = -232.64
##
## $degrees_of_freedom
## [1] 133
##
## $ttable
## Estimate SE t.value p.value
## ma1 -0.3984 0.0808 -4.9313 0.0000
## ma2 -0.2173 0.0768 -2.8303 0.0054
## constant 0.0072 0.0033 2.1463 0.0337
##
## $AIC
## [1] -3.579224
##
## $AICc
## [1] -3.562273
##
## $BIC
## [1] -4.514974
# Fit ARIMA(1,1,1) to globtemp and check diagnostics.
# Competing model: all terms significant, but AIC/BIC (below) are slightly
# worse than the ARIMA(0,1,2) fit above, so (0,1,2) is retained.
astsa::sarima(globtemp, p=1, d=1, q=1)
## initial value -2.218917
## iter 2 value -2.253118
## iter 3 value -2.263750
## iter 4 value -2.272144
## iter 5 value -2.282786
## iter 6 value -2.296777
## iter 7 value -2.297062
## iter 8 value -2.297253
## iter 9 value -2.297389
## iter 10 value -2.297405
## iter 11 value -2.297413
## iter 12 value -2.297413
## iter 13 value -2.297414
## iter 13 value -2.297414
## iter 13 value -2.297414
## final value -2.297414
## converged
## initial value -2.305504
## iter 2 value -2.305800
## iter 3 value -2.305821
## iter 4 value -2.306655
## iter 5 value -2.306875
## iter 6 value -2.306950
## iter 7 value -2.306955
## iter 8 value -2.306955
## iter 8 value -2.306955
## final value -2.306955
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = constant, optim.control = list(trace = trc, REPORT = 1,
## reltol = tol))
##
## Coefficients:
## ar1 ma1 constant
## 0.3549 -0.7663 0.0072
## s.e. 0.1314 0.0874 0.0032
##
## sigma^2 estimated as 0.009885: log likelihood = 119.88, aic = -231.76
##
## $degrees_of_freedom
## [1] 133
##
## $ttable
## Estimate SE t.value p.value
## ar1 0.3549 0.1314 2.7008 0.0078
## ma1 -0.7663 0.0874 -8.7701 0.0000
## constant 0.0072 0.0032 2.2738 0.0246
##
## $AIC
## [1] -3.572642
##
## $AICc
## [1] -3.555691
##
## $BIC
## [1] -4.508392
# Second practice series (120 points, entered verbatim); the full series y is
# kept as the "truth" for comparing against forecasts later.
yData <- c( 1.475, 3.061, 6.53, 9.844, 15.735, 20.798, 24.635, 27.322, 28.793, 30.4, 31.672, 32.209, 33.255, 35.53, 35.87, 35.65, 35.766, 34.509, 32.438, 30.804, 30.913, 29.845, 28.667, 27.555, 26.962, 26.649, 28.018, 30.804, 34.625, 38.363, 41.745, 46.059, 51.431, 56.778, 61.529, 65.51, 69.054, 70.332, 72.318, 73.341, 74.756, 77.632, 78.618, 78.419, 78.412, 80.362, 82.771, 84.24, 86.619, 89.241, 93.318, 95.566, 98.509, 102.085, 105.017, 107.242, 107.946, 107.948, 107.554, 106.475, 105.517, 104.357, 104.296, 103.946, 102.896, 102.218, 102.796, 102.726, 101.759, 101.336, 100.97, 101.816, 101.736, 100.882, 100.974, 101.784, 101.409, 102.486, 102.971, 103.105, 103.886, 104.559, 104.349, 104.152, 105.461, 106.456, 106.611, 106.827, 108.587, 110.033, 110.993, 113.209, 113.397, 113.575, 113.945, 113.785, 113.473, 112.939, 112.222, 110.297, 108.388, 108.208, 107.125, 105.905, 103.513, 102.305, 102.325, 103.09, 104.299, 104.13, 104.388, 104.854, 106.697, 109.026, 110.97, 112.576, 113.896, 115.206, 116.374, 117.487 )
y <- ts(data=yData, frequency=1, start=c(1, 1))
str(y)
## Time-Series [1:120] from 1 to 120: 1.48 3.06 6.53 9.84 15.73 ...
# Hold out the last 20 observations: x is the training window (first 100)
x <- window(y, end=c(100, 1))
str(x)
## Time-Series [1:100] from 1 to 100: 1.48 3.06 6.53 9.84 15.73 ...
# Plot P/ACF pair of differenced data.
# Below: ACF tails off slowly, PACF cuts off after lag 1 -> AR(1) on the
# differences, i.e. ARIMA(1,1,0) for x.
astsa::acf2(diff(x))
## ACF PACF
## [1,] 0.83 0.83
## [2,] 0.69 -0.01
## [3,] 0.59 0.05
## [4,] 0.46 -0.13
## [5,] 0.32 -0.14
## [6,] 0.19 -0.08
## [7,] 0.09 0.02
## [8,] -0.02 -0.14
## [9,] -0.10 0.01
## [10,] -0.20 -0.17
## [11,] -0.25 0.08
## [12,] -0.23 0.11
## [13,] -0.22 0.00
## [14,] -0.21 0.00
## [15,] -0.21 -0.12
## [16,] -0.15 0.12
## [17,] -0.10 0.01
## [18,] -0.05 0.03
## [19,] -0.01 -0.02
## [20,] 0.04 0.00
# Fit model - check t-table and diagnostics.
# ARIMA(1,1,0) with drift on the 100-point training window; ar1 is highly
# significant in the t-table below.
astsa::sarima(x, p=1, d=1, q=0)
## initial value 0.591964
## iter 2 value -0.038076
## iter 3 value -0.039015
## iter 4 value -0.039144
## iter 5 value -0.039245
## iter 6 value -0.039461
## iter 7 value -0.039501
## iter 8 value -0.039514
## iter 9 value -0.039528
## iter 10 value -0.039550
## iter 11 value -0.039561
## iter 12 value -0.039564
## iter 13 value -0.039564
## iter 14 value -0.039564
## iter 15 value -0.039564
## iter 16 value -0.039564
## iter 17 value -0.039564
## iter 17 value -0.039564
## iter 17 value -0.039564
## final value -0.039564
## converged
## initial value -0.037148
## iter 2 value -0.037210
## iter 3 value -0.037327
## iter 4 value -0.037336
## iter 5 value -0.037368
## iter 6 value -0.037369
## iter 7 value -0.037369
## iter 8 value -0.037369
## iter 9 value -0.037369
## iter 10 value -0.037369
## iter 10 value -0.037369
## iter 10 value -0.037369
## final value -0.037369
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = constant, optim.control = list(trace = trc, REPORT = 1,
## reltol = tol))
##
## Coefficients:
## ar1 constant
## 0.8504 0.9685
## s.e. 0.0525 0.6111
##
## sigma^2 estimated as 0.916: log likelihood = -136.78, aic = 279.55
##
## $degrees_of_freedom
## [1] 98
##
## $ttable
## Estimate SE t.value p.value
## ar1 0.8504 0.0525 16.1970 0.0000
## constant 0.9685 0.6111 1.5849 0.1162
##
## $AIC
## [1] 0.9522847
##
## $AICc
## [1] 0.9747847
##
## $BIC
## [1] 0.004388103
# Forecast the data 20 time periods ahead (the held-out portion of y).
# sarima.for returns $pred (point forecasts) and $se (forecast standard
# errors), both shown below; the $se values widen with the horizon.
astsa::sarima.for(x, n.ahead = 20, p = 1, d = 1, q = 0)
## $pred
## Time Series:
## Start = 101
## End = 120
## Frequency = 1
## [1] 108.8047 107.6805 106.8692 106.3241 106.0054 105.8792 105.9167
## [8] 106.0934 106.3886 106.7844 107.2659 107.8202 108.4365 109.1054
## [15] 109.8192 110.5710 111.3552 112.1670 113.0022 113.8574
##
## $se
## Time Series:
## Start = 101
## End = 120
## Frequency = 1
## [1] 0.9570902 2.0131099 3.1812378 4.4084826 5.6617802 6.9197771
## [7] 8.1684230 9.3984599 10.6038817 11.7809361 12.9274522 14.0423743
## [13] 15.1254331 16.1769097 17.1974643 18.1880125 19.1496340 20.0835066
## [19] 20.9908583 21.8729318
# Overlay the full actual series on the forecast plot so the held-out last 20
# points can be compared against the predictions
lines(y)
# Fit an ARIMA(0,1,2) to globtemp and check the fit
# (same model as earlier; refit here as the basis for the forecast below)
astsa::sarima(globtemp, p=0, d=1, q=2)
## initial value -2.220513
## iter 2 value -2.294887
## iter 3 value -2.307682
## iter 4 value -2.309170
## iter 5 value -2.310360
## iter 6 value -2.311251
## iter 7 value -2.311636
## iter 8 value -2.311648
## iter 9 value -2.311649
## iter 9 value -2.311649
## iter 9 value -2.311649
## final value -2.311649
## converged
## initial value -2.310187
## iter 2 value -2.310197
## iter 3 value -2.310199
## iter 4 value -2.310201
## iter 5 value -2.310202
## iter 5 value -2.310202
## iter 5 value -2.310202
## final value -2.310202
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = constant, optim.control = list(trace = trc, REPORT = 1,
## reltol = tol))
##
## Coefficients:
## ma1 ma2 constant
## -0.3984 -0.2173 0.0072
## s.e. 0.0808 0.0768 0.0033
##
## sigma^2 estimated as 0.00982: log likelihood = 120.32, aic = -232.64
##
## $degrees_of_freedom
## [1] 133
##
## $ttable
## Estimate SE t.value p.value
## ma1 -0.3984 0.0808 -4.9313 0.0000
## ma2 -0.2173 0.0768 -2.8303 0.0054
## constant 0.0072 0.0033 2.1463 0.0337
##
## $AIC
## [1] -3.579224
##
## $AICc
## [1] -3.562273
##
## $BIC
## [1] -4.514974
# Forecast data 35 years into the future.
# Predictions run 2016-2050 (see $pred below); the $se values grow with the
# horizon, reflecting increasing forecast uncertainty.
astsa::sarima.for(globtemp, n.ahead=35, p=0, d=1, q=2)
## $pred
## Time Series:
## Start = 2016
## End = 2050
## Frequency = 1
## [1] 0.7995567 0.7745381 0.7816919 0.7888457 0.7959996 0.8031534 0.8103072
## [8] 0.8174611 0.8246149 0.8317688 0.8389226 0.8460764 0.8532303 0.8603841
## [15] 0.8675379 0.8746918 0.8818456 0.8889995 0.8961533 0.9033071 0.9104610
## [22] 0.9176148 0.9247687 0.9319225 0.9390763 0.9462302 0.9533840 0.9605378
## [29] 0.9676917 0.9748455 0.9819994 0.9891532 0.9963070 1.0034609 1.0106147
##
## $se
## Time Series:
## Start = 2016
## End = 2050
## Frequency = 1
## [1] 0.09909556 0.11564576 0.12175580 0.12757353 0.13313729 0.13847769
## [7] 0.14361964 0.14858376 0.15338730 0.15804492 0.16256915 0.16697084
## [13] 0.17125943 0.17544322 0.17952954 0.18352490 0.18743511 0.19126540
## [19] 0.19502047 0.19870459 0.20232164 0.20587515 0.20936836 0.21280424
## [25] 0.21618551 0.21951471 0.22279416 0.22602604 0.22921235 0.23235497
## [31] 0.23545565 0.23851603 0.24153763 0.24452190 0.24747019
Chapter 4 - Seasonal ARIMA
Pure Seasonal Models - we often collect data with known seasonal patterns (quarterly, monthly, etc.):
Mixed Seasonal Models - purely seasonal models are rare, so the mixed model is more common:
Forecasting Seasonal ARIMA - relatively easy using astsa::sarima.for():
Example code includes:
# Simulated purely seasonal series (252 monthly points, entered verbatim)
xData <- c( -3.063, -1.997, -3.925, 5.37, 7.47, 0.502, 2.477, -10.093, -3.462, 1.835, 3.802, 1.853, -1.945, -1.881, -4.783, 4.361, 7.159, 2.699, 0.237, -9.933, -3.406, 0.718, 2.713, 2.309, -1.308, -0.573, -5.37, 3.053, 7.749, 3.926, -0.354, -10.326, -1.302, 1.796, 1.537, 4.596, -0.938, -0.753, -5.059, 3.346, 7.319, 2.802, 0.236, -9.541, -1.466, 3.829, 1.562, 3.934, -0.795, -0.32, -4.607, 2.947, 6.479, 0.403, 0.413, -8.069, -2.512, 4.105, 0.449, 1.274, -0.561, -0.346, -2.933, 2.525, 5.876, -1.374, -0.833, -8.193, -1.465, 5.502, 0.145, 1.336, -0.097, 0.893, -2.447, 2.869, 4.522, -1.133, -0.961, -8.43, -1.324, 6.856, 0.561, 1.842, -0.454, 2.786, -4.908, 2.909, 3.65, -0.681, -1.064, -6.475, 0.313, 6.849, 2.605, 3.129, -0.627, 2.904, -6.023, 1.976, 3.745, -1.207, -0.231, -5.569, 0.116, 4.874, 3.749, 4.216, -0.801, 2.669, -3.866, 3.526, 3.61, -0.298, -0.366, -5.148, -1.465, 2.259, 3.214, 4.789, -0.784, 2.858, -3.764, 3.885, 2.725, 1.297, -1.534, -4.081, -2.081, -0.05, 1.18, 4.582, -2.742, 1.99, -2.828, 4.169, 0.753, 2.19, -1.838, -2.821, -4.067, -1.38, 0.983, 4.561, -3.011, 0.569, -3.255, 2.012, -0.396, 1.63, -1.766, -2.187, -2.507, -1.296, 1.745, 4.975, -3.102, 1.36, -2.611, -0.109, 1.388, 1.727, -2.49, -3.813, -1.957, -0.572, 2.379, 5.92, -5.054, 1.698, -2.621, -1.539, 1.802, 1.932, -1.406, -5.839, -3.011, -0.79, 2.08, 4.144, -6.072, 2.374, -2.659, -2.098, 0.722, 2.443, -1.122, -5.98, -4.85, -0.712, 1.868, 2.127, -6.854, 1.91, -3.205, -1.139, 0.581, 1.527, -2.051, -6.724, -4.612, -1.236, 0.59, 0.828, -7.434, 0.602, -4.288, -1.825, -0.242, 0.107, -2.541, -7.618, -4.066, 0.323, 0.167, 0.145, -6.404, 0.585, -3.075, -3.812, -2.484, 0.783, -2.512, -7.77, -4.389, 2.426, 0.607, 0.47, -5.934, 1.551, -1.288, -3.312, -3.321, 2.478, -1.351, -10.693, -5.375, 3.161, -0.474, 2.11, -6.453, 0.999, -0.473, -2.442, -3.74, 3.271, -2.57, -10.644, -3.972, 2.408, 0.068, 3.375 )
# Monthly ts (frequency 12): 252 observations = 21 years
x <- ts(data=xData, frequency=12, start=c(1, 1))
str(x)
## Time-Series [1:252] from 1 to 21.9: -3.06 -2 -3.92 5.37 7.47 ...
# Plot sample P/ACF to lag 60 and compare to the true values.
# Below: large spikes at the seasonal lags 12, 24, 36, ... (ACF 0.89, 0.76,
# 0.65, ...) with the seasonal ACF tailing off -> purely seasonal structure.
astsa::acf2(x, max.lag = 60)
## ACF PACF
## [1,] 0.13 0.13
## [2,] -0.16 -0.18
## [3,] -0.35 -0.32
## [4,] -0.13 -0.09
## [5,] 0.27 0.22
## [6,] 0.26 0.11
## [7,] 0.29 0.31
## [8,] -0.12 0.03
## [9,] -0.34 -0.18
## [10,] -0.14 -0.02
## [11,] 0.11 0.00
## [12,] 0.89 0.84
## [13,] 0.12 -0.15
## [14,] -0.15 0.12
## [15,] -0.33 0.06
## [16,] -0.12 0.06
## [17,] 0.25 0.01
## [18,] 0.24 -0.06
## [19,] 0.28 -0.10
## [20,] -0.12 -0.06
## [21,] -0.32 0.00
## [22,] -0.11 0.07
## [23,] 0.09 -0.05
## [24,] 0.76 -0.14
## [25,] 0.09 0.00
## [26,] -0.14 -0.02
## [27,] -0.32 -0.03
## [28,] -0.12 -0.05
## [29,] 0.23 -0.03
## [30,] 0.22 0.00
## [31,] 0.25 0.00
## [32,] -0.13 0.02
## [33,] -0.31 -0.02
## [34,] -0.07 0.04
## [35,] 0.08 -0.02
## [36,] 0.65 0.07
## [37,] 0.06 -0.03
## [38,] -0.14 -0.05
## [39,] -0.30 -0.01
## [40,] -0.12 0.01
## [41,] 0.20 -0.13
## [42,] 0.19 -0.03
## [43,] 0.22 -0.06
## [44,] -0.13 -0.03
## [45,] -0.30 -0.02
## [46,] -0.03 0.02
## [47,] 0.06 -0.02
## [48,] 0.56 0.02
## [49,] 0.04 0.08
## [50,] -0.13 0.04
## [51,] -0.29 -0.02
## [52,] -0.11 0.01
## [53,] 0.17 0.02
## [54,] 0.16 -0.03
## [55,] 0.19 0.01
## [56,] -0.14 -0.07
## [57,] -0.28 0.00
## [58,] -0.02 -0.03
## [59,] 0.05 0.01
## [60,] 0.49 0.01
# Fit the seasonal model to x.
# Purely seasonal ARMA: SAR(1) and SMA(1) at period 12, no non-seasonal
# terms; both seasonal coefficients are significant in the t-table below.
astsa::sarima(x, p = 0, d = 0, q = 0, P = 1, D = 0, Q = 1, S = 12)
## initial value 1.274226
## iter 2 value 0.228901
## iter 3 value 0.028957
## iter 4 value 0.010808
## iter 5 value -0.002171
## iter 6 value -0.017847
## iter 7 value -0.018632
## iter 8 value -0.018759
## iter 9 value -0.018822
## iter 10 value -0.019245
## iter 11 value -0.019842
## iter 12 value -0.020194
## iter 13 value -0.020236
## iter 14 value -0.020241
## iter 15 value -0.020241
## iter 15 value -0.020241
## final value -0.020241
## converged
## initial value 0.064889
## iter 2 value 0.063302
## iter 3 value 0.061944
## iter 4 value 0.061263
## iter 5 value 0.061164
## iter 6 value 0.061036
## iter 7 value 0.060772
## iter 8 value 0.060428
## iter 9 value 0.060343
## iter 10 value 0.060260
## iter 11 value 0.060192
## iter 12 value 0.060181
## iter 13 value 0.060178
## iter 14 value 0.060174
## iter 15 value 0.060165
## iter 16 value 0.060160
## iter 17 value 0.060159
## iter 18 value 0.060151
## iter 19 value 0.060150
## iter 20 value 0.060149
## iter 21 value 0.060149
## iter 22 value 0.060148
## iter 23 value 0.060148
## iter 24 value 0.060148
## iter 25 value 0.060148
## iter 25 value 0.060148
## iter 25 value 0.060148
## final value 0.060148
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = xmean, include.mean = FALSE, optim.control = list(trace = trc,
## REPORT = 1, reltol = tol))
##
## Coefficients:
## sar1 sma1 xmean
## 0.9310 0.4825 -0.5765
## s.e. 0.0204 0.0633 0.8797
##
## sigma^2 estimated as 0.9766: log likelihood = -372.73, aic = 753.46
##
## $degrees_of_freedom
## [1] 249
##
## $ttable
## Estimate SE t.value p.value
## sar1 0.9310 0.0204 45.6128 0.0000
## sma1 0.4825 0.0633 7.6187 0.0000
## xmean -0.5765 0.8797 -0.6553 0.5129
##
## $AIC
## [1] 1.000131
##
## $AICc
## [1] 1.00871
##
## $BIC
## [1] 0.04214768
# Simulated mixed seasonal series (252 monthly points, entered verbatim):
# combines seasonal and non-seasonal structure
xData <- c( -1.243, -0.68, 1.356, 0.843, -0.409, 1.062, -1.08, 3.002, 0.812, -0.388, -1.788, 2.321, -3.264, 0.866, -0.004, 0.289, 0.855, 1.445, -1.085, 2.426, -2.201, -1.014, 0.127, 1.326, -2.958, 2.635, -1.209, 0.288, 0.025, 2.225, -0.792, 2.58, -2.44, -1.961, 1.732, -0.62, -1.063, 1.148, -0.553, 1.192, -1.642, 0.836, 1.022, 0.844, 0.407, -1.239, -0.093, -0.918, -0.543, 0.017, 0.218, 1.895, -1.628, 1.092, 1.425, -0.962, -1.407, 0.58, 0.128, -0.509, -0.38, 0.886, -1.135, 2.319, -1.199, 2.7, 0.34, -1.393, -1.553, 1.149, 1.95, -0.563, -1.746, 2.44, -1.449, 0.306, 0.495, 2.17, 1.035, 0.186, 0.044, 0.972, -1.724, 1.314, -1.912, 1.81, 1.111, -1.517, 2.95, -1.682, 2.422, -1.526, 0.372, -0.503, -0.16, -1.42, -0.826, 1.201, 1.764, -1.759, 3.392, -0.873, 1.489, -2.768, 0.442, 0.171, -1.117, -0.757, 0.756, 0.931, -0.832, 1.028, 1.176, -0.27, 0.818, -2.096, -0.234, 0.31, -1.018, 2.883, -1.119, 0.201, -0.495, 1.506, -0.696, 0.021, 0.461, -2.817, 0.665, -0.77, 2.283, 0.635, -2.876, -0.201, 1.109, 0.666, 0.096, -0.776, -2.022, 2.101, -0.861, -1.659, 3.324, -0.428, 0.002, -0.063, 0.081, -0.034, -1.022, 0.247, -2.832, 4.967, -2.348, -1.963, 2.966, 0.317, 0.678, -1.146, -0.279, 1.632, -3.308, 1.183, 0.875, 1.941, -1.427, -1.036, 1.195, 1.425, 1.126, -3.354, 1.025, 0.976, -1.01, -1.437, 2.349, -0.452, 0.269, -0.245, -1.107, 2.442, -0.544, -0.114, -0.121, 1.017, -1.107, -0.679, 0.356, -0.535, 0.584, 1.075, -1.73, 1.321, -1.503, 0.797, -0.713, 1.599, -1.551, 1.462, -1.566, -2.094, 1.159, 1.52, 0.528, -0.48, 0.02, -0.357, 1.088, -0.936, 2.707, -0.053, -1.876, -1.162, 2.719, -0.818, -0.351, 0.459, 0.65, -0.735, 2.805, -1.153, 2.171, -0.007, -0.54, -1.186, 1.694, 0.491, -3.27, 1.605, -0.256, 0.235, 2.334, 1.164, -2.024, -0.174, 1.588, -3.079, -1.286, 2.68, -2.625, 0.28, -0.91, 0.789, 1.677, 1.291, -2.935, 0.587, 0.783, -0.749, -0.455, 1.181, -0.221, -1.713 )
x <- ts(data=xData, frequency=12, start=c(1, 1))
str(x)
## Time-Series [1:252] from 1 to 21.9: -1.243 -0.68 1.356 0.843 -0.409 ...
# Plot sample P/ACF pair to lag 60 and compare to actual.
# Below: a spike at lag 1 (non-seasonal MA) plus spikes at the seasonal
# lag 12 -> mixed seasonal/non-seasonal model.
astsa::acf2(x, max.lag=60)
## ACF PACF
## [1,] -0.41 -0.41
## [2,] -0.03 -0.24
## [3,] 0.00 -0.14
## [4,] 0.06 -0.01
## [5,] -0.10 -0.10
## [6,] 0.00 -0.11
## [7,] -0.06 -0.17
## [8,] 0.04 -0.11
## [9,] 0.04 -0.02
## [10,] -0.06 -0.08
## [11,] -0.13 -0.28
## [12,] 0.46 0.33
## [13,] -0.19 0.20
## [14,] -0.03 0.12
## [15,] 0.01 0.09
## [16,] 0.01 0.02
## [17,] 0.00 0.13
## [18,] -0.14 -0.08
## [19,] 0.02 -0.08
## [20,] 0.03 -0.06
## [21,] 0.04 -0.04
## [22,] -0.07 -0.04
## [23,] 0.13 0.21
## [24,] -0.06 -0.21
## [25,] 0.06 -0.06
## [26,] -0.03 -0.01
## [27,] -0.05 -0.10
## [28,] 0.05 0.06
## [29,] 0.00 -0.03
## [30,] -0.11 0.06
## [31,] -0.03 -0.05
## [32,] 0.07 -0.03
## [33,] -0.06 -0.12
## [34,] 0.05 -0.02
## [35,] 0.06 -0.13
## [36,] -0.04 0.08
## [37,] 0.05 0.01
## [38,] -0.02 -0.03
## [39,] -0.07 0.01
## [40,] 0.07 -0.05
## [41,] -0.09 -0.09
## [42,] 0.10 0.11
## [43,] -0.11 0.05
## [44,] 0.06 0.03
## [45,] -0.08 0.03
## [46,] 0.12 0.04
## [47,] -0.03 0.16
## [48,] -0.04 -0.12
## [49,] 0.05 -0.06
## [50,] -0.04 -0.05
## [51,] 0.04 0.03
## [52,] -0.07 -0.03
## [53,] -0.07 -0.09
## [54,] 0.17 -0.01
## [55,] -0.09 0.00
## [56,] -0.03 -0.05
## [57,] 0.04 -0.03
## [58,] 0.05 -0.03
## [59,] 0.00 -0.03
## [60,] -0.07 0.07
# Fit the seasonal model to x.
# Mixed model: non-seasonal MA(1) plus seasonal MA(1) at period 12; both
# ma1 and sma1 are significant in the t-table below.
astsa::sarima(x, p=0, d=0, q=1, P=0, D=0, Q=1, S=12)
## initial value 0.403514
## iter 2 value 0.107253
## iter 3 value 0.063347
## iter 4 value 0.050288
## iter 5 value 0.044945
## iter 6 value 0.041690
## iter 7 value 0.041311
## iter 8 value 0.041284
## iter 9 value 0.041280
## iter 10 value 0.041271
## iter 11 value 0.041271
## iter 12 value 0.041271
## iter 12 value 0.041271
## iter 12 value 0.041271
## final value 0.041271
## converged
## initial value 0.030505
## iter 2 value 0.027716
## iter 3 value 0.026597
## iter 4 value 0.026568
## iter 5 value 0.026568
## iter 6 value 0.026568
## iter 7 value 0.026567
## iter 7 value 0.026567
## iter 7 value 0.026567
## final value 0.026567
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = xmean, include.mean = FALSE, optim.control = list(trace = trc,
## REPORT = 1, reltol = tol))
##
## Coefficients:
## ma1 sma1 xmean
## -0.6142 0.7887 0.0784
## s.e. 0.0564 0.0475 0.0430
##
## sigma^2 estimated as 1.005: log likelihood = -364.27, aic = 736.54
##
## $degrees_of_freedom
## [1] 249
##
## $ttable
## Estimate SE t.value p.value
## ma1 -0.6142 0.0564 -10.8811 0.0000
## sma1 0.7887 0.0475 16.6073 0.0000
## xmean 0.0784 0.0430 1.8250 0.0692
##
## $AIC
## [1] 1.028746
##
## $AICc
## [1] 1.037325
##
## $BIC
## [1] 0.07076309
# Monthly U.S. unemployment series (1948-1979 per str below) from astsa
data(unemp, package="astsa")
str(unemp)
## Time-Series [1:372] from 1948 to 1979: 235 281 265 241 201 ...
# Plot unemp
plot(unemp)
# Difference your data and plot it (removes trend)
d_unemp <- diff(unemp)
plot(d_unemp)
# Seasonally difference d_unemp and plot it (removes the period-12 cycle)
dd_unemp <- diff(d_unemp, lag = 12)
plot(dd_unemp)
# Plot P/ACF pair of fully differenced data to lag 60.
# dd_unemp already equals diff(diff(unemp), lag = 12) from above, so the
# original's redundant recomputation of it here has been removed.
astsa::acf2(dd_unemp, max.lag=60)
## ACF PACF
## [1,] 0.21 0.21
## [2,] 0.33 0.29
## [3,] 0.15 0.05
## [4,] 0.17 0.05
## [5,] 0.10 0.01
## [6,] 0.06 -0.02
## [7,] -0.06 -0.12
## [8,] -0.02 -0.03
## [9,] -0.09 -0.05
## [10,] -0.17 -0.15
## [11,] -0.08 0.02
## [12,] -0.48 -0.43
## [13,] -0.18 -0.02
## [14,] -0.16 0.15
## [15,] -0.11 0.03
## [16,] -0.15 -0.04
## [17,] -0.09 -0.01
## [18,] -0.09 0.00
## [19,] 0.03 0.01
## [20,] -0.01 0.01
## [21,] 0.02 -0.01
## [22,] -0.02 -0.16
## [23,] 0.01 0.01
## [24,] -0.02 -0.27
## [25,] 0.09 0.05
## [26,] -0.05 -0.01
## [27,] -0.01 -0.05
## [28,] 0.03 0.05
## [29,] 0.08 0.09
## [30,] 0.01 -0.04
## [31,] 0.03 0.02
## [32,] -0.05 -0.07
## [33,] 0.01 -0.01
## [34,] 0.02 -0.08
## [35,] -0.06 -0.08
## [36,] -0.02 -0.23
## [37,] -0.12 -0.08
## [38,] 0.01 0.06
## [39,] -0.03 -0.07
## [40,] -0.03 -0.01
## [41,] -0.10 0.03
## [42,] -0.02 -0.03
## [43,] -0.13 -0.11
## [44,] 0.00 -0.04
## [45,] -0.06 0.01
## [46,] 0.01 0.00
## [47,] 0.02 -0.03
## [48,] 0.11 -0.04
## [49,] 0.13 0.02
## [50,] 0.10 0.03
## [51,] 0.07 -0.05
## [52,] 0.10 0.02
## [53,] 0.12 0.02
## [54,] 0.06 -0.08
## [55,] 0.14 0.00
## [56,] 0.05 -0.03
## [57,] 0.04 -0.07
## [58,] 0.04 0.05
## [59,] 0.07 0.04
## [60,] -0.03 -0.04
# Fit an appropriate model.
# SARIMA(2,1,0)x(0,1,1)_12: the fully differenced series' PACF cuts off
# around lag 2 (non-seasonal AR(2)) and the seasonal ACF cuts off at lag 12
# (seasonal MA(1)); all three coefficients are significant below.
astsa::sarima(unemp, p=2, d=1, q=0, P=0, D=1, Q=1, S=12)
## initial value 3.340809
## iter 2 value 3.105512
## iter 3 value 3.086631
## iter 4 value 3.079778
## iter 5 value 3.069447
## iter 6 value 3.067659
## iter 7 value 3.067426
## iter 8 value 3.067418
## iter 8 value 3.067418
## final value 3.067418
## converged
## initial value 3.065481
## iter 2 value 3.065478
## iter 3 value 3.065477
## iter 3 value 3.065477
## iter 3 value 3.065477
## final value 3.065477
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), include.mean = !no.constant, optim.control = list(trace = trc,
## REPORT = 1, reltol = tol))
##
## Coefficients:
## ar1 ar2 sma1
## 0.1351 0.2464 -0.6953
## s.e. 0.0513 0.0515 0.0381
##
## sigma^2 estimated as 449.6: log likelihood = -1609.91, aic = 3227.81
##
## $degrees_of_freedom
## [1] 369
##
## $ttable
## Estimate SE t.value p.value
## ar1 0.1351 0.0513 2.6326 0.0088
## ar2 0.2464 0.0515 4.7795 0.0000
## sma1 -0.6953 0.0381 -18.2362 0.0000
##
## $AIC
## [1] 7.12457
##
## $AICc
## [1] 7.130239
##
## $BIC
## [1] 6.156174
# Monthly chicken price series (2002-2016 per str below) from astsa
data(chicken, package="astsa")
str(chicken)
## Time-Series [1:180] from 2002 to 2016: 65.6 66.5 65.7 64.3 63.2 ...
# Plot differenced chicken
plot(diff(chicken))
# Plot P/ACF pair of differenced data to lag 60.
# Below: besides the low-lag structure, the ACF shows recurring bumps at
# lags 12, 24, 36, 48 - residual seasonal correlation.
astsa::acf2(diff(chicken), max.lag=60)
## ACF PACF
## [1,] 0.72 0.72
## [2,] 0.39 -0.29
## [3,] 0.09 -0.14
## [4,] -0.07 0.03
## [5,] -0.16 -0.10
## [6,] -0.20 -0.06
## [7,] -0.27 -0.19
## [8,] -0.23 0.12
## [9,] -0.11 0.10
## [10,] 0.09 0.16
## [11,] 0.26 0.09
## [12,] 0.33 0.00
## [13,] 0.20 -0.22
## [14,] 0.07 0.03
## [15,] -0.03 0.03
## [16,] -0.10 -0.11
## [17,] -0.19 -0.09
## [18,] -0.25 0.01
## [19,] -0.29 -0.03
## [20,] -0.20 0.07
## [21,] -0.08 -0.04
## [22,] 0.08 0.06
## [23,] 0.16 -0.05
## [24,] 0.18 0.02
## [25,] 0.08 -0.14
## [26,] -0.06 -0.19
## [27,] -0.21 -0.13
## [28,] -0.31 -0.06
## [29,] -0.40 -0.08
## [30,] -0.40 -0.05
## [31,] -0.33 0.01
## [32,] -0.18 0.03
## [33,] 0.02 0.10
## [34,] 0.20 0.02
## [35,] 0.30 -0.01
## [36,] 0.35 0.09
## [37,] 0.26 -0.12
## [38,] 0.13 0.01
## [39,] -0.02 -0.01
## [40,] -0.14 -0.05
## [41,] -0.23 0.02
## [42,] -0.21 0.12
## [43,] -0.18 -0.05
## [44,] -0.11 -0.13
## [45,] -0.03 -0.07
## [46,] 0.08 0.01
## [47,] 0.21 0.14
## [48,] 0.33 0.05
## [49,] 0.26 -0.20
## [50,] 0.12 -0.01
## [51,] -0.01 0.07
## [52,] -0.11 -0.04
## [53,] -0.13 0.02
## [54,] -0.09 0.00
## [55,] -0.09 -0.08
## [56,] -0.06 0.03
## [57,] 0.03 0.04
## [58,] 0.17 0.00
## [59,] 0.29 0.01
## [60,] 0.32 0.03
# Fit ARIMA(2,1,0) to chicken - not so good.
# The purely non-seasonal model ignores the seasonal correlation visible at
# lags 12, 24, ... in the ACF table above.
astsa::sarima(chicken, p=2, d=1, q=0)
## initial value 0.001863
## iter 2 value -0.156034
## iter 3 value -0.359181
## iter 4 value -0.424164
## iter 5 value -0.430212
## iter 6 value -0.432744
## iter 7 value -0.432747
## iter 8 value -0.432749
## iter 9 value -0.432749
## iter 10 value -0.432751
## iter 11 value -0.432752
## iter 12 value -0.432752
## iter 13 value -0.432752
## iter 13 value -0.432752
## iter 13 value -0.432752
## final value -0.432752
## converged
## initial value -0.420883
## iter 2 value -0.420934
## iter 3 value -0.420936
## iter 4 value -0.420937
## iter 5 value -0.420937
## iter 6 value -0.420937
## iter 6 value -0.420937
## iter 6 value -0.420937
## final value -0.420937
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = constant, optim.control = list(trace = trc, REPORT = 1,
## reltol = tol))
##
## Coefficients:
## ar1 ar2 constant
## 0.9494 -0.3069 0.2632
## s.e. 0.0717 0.0718 0.1362
##
## sigma^2 estimated as 0.4286: log likelihood = -178.64, aic = 365.28
##
## $degrees_of_freedom
## [1] 177
##
## $ttable
## Estimate SE t.value p.value
## ar1 0.9494 0.0717 13.2339 0.0000
## ar2 -0.3069 0.0718 -4.2723 0.0000
## constant 0.2632 0.1362 1.9328 0.0549
##
## $AIC
## [1] 0.1861622
##
## $AICc
## [1] 0.1985432
##
## $BIC
## [1] -0.7606218
# Fit SARIMA(2,1,0,1,0,0,12) to chicken - that works.
# Adding a seasonal AR(1) term (sar1, significant in the t-table below)
# improves AIC/AICc/BIC relative to the non-seasonal ARIMA(2,1,0) fit.
astsa::sarima(chicken, p=2, d=1, q=0, P=1, D=0, Q=0, S=12)
## initial value 0.015039
## iter 2 value -0.226398
## iter 3 value -0.412955
## iter 4 value -0.460882
## iter 5 value -0.470787
## iter 6 value -0.471082
## iter 7 value -0.471088
## iter 8 value -0.471090
## iter 9 value -0.471092
## iter 10 value -0.471095
## iter 11 value -0.471095
## iter 12 value -0.471096
## iter 13 value -0.471096
## iter 14 value -0.471096
## iter 15 value -0.471097
## iter 16 value -0.471097
## iter 16 value -0.471097
## iter 16 value -0.471097
## final value -0.471097
## converged
## initial value -0.473585
## iter 2 value -0.473664
## iter 3 value -0.473721
## iter 4 value -0.473823
## iter 5 value -0.473871
## iter 6 value -0.473885
## iter 7 value -0.473886
## iter 8 value -0.473886
## iter 8 value -0.473886
## iter 8 value -0.473886
## final value -0.473886
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = constant, optim.control = list(trace = trc, REPORT = 1,
## reltol = tol))
##
## Coefficients:
## ar1 ar2 sar1 constant
## 0.9154 -0.2494 0.3237 0.2353
## s.e. 0.0733 0.0739 0.0715 0.1973
##
## sigma^2 estimated as 0.3828: log likelihood = -169.16, aic = 348.33
##
## $degrees_of_freedom
## [1] 176
##
## $ttable
## Estimate SE t.value p.value
## ar1 0.9154 0.0733 12.4955 0.0000
## ar2 -0.2494 0.0739 -3.3728 0.0009
## sar1 0.3237 0.0715 4.5238 0.0000
## constant 0.2353 0.1973 1.1923 0.2347
##
## $AIC
## [1] 0.0842377
##
## $AICc
## [1] 0.09726452
##
## $BIC
## [1] -0.8448077
# Monthly U.S. birth series (1948-1979 per str below) from astsa
data(birth, package="astsa")
str(birth)
## Time-Series [1:373] from 1948 to 1979: 295 286 300 278 272 268 308 321 313 308 ...
# Plot P/ACF to lag 60 of differenced data.
# Below: strong ACF spikes persist at lags 12, 24, 36, ... after a single
# difference, indicating a seasonal difference is still needed.
d_birth <- diff(birth)
astsa::acf2(d_birth, max.lag=60)
## ACF PACF
## [1,] -0.32 -0.32
## [2,] 0.16 0.06
## [3,] -0.08 -0.01
## [4,] -0.19 -0.25
## [5,] 0.09 -0.03
## [6,] -0.28 -0.26
## [7,] 0.06 -0.17
## [8,] -0.19 -0.29
## [9,] -0.05 -0.35
## [10,] 0.17 -0.16
## [11,] -0.26 -0.59
## [12,] 0.82 0.57
## [13,] -0.28 0.13
## [14,] 0.17 0.11
## [15,] -0.07 0.13
## [16,] -0.18 0.09
## [17,] 0.08 0.00
## [18,] -0.28 0.00
## [19,] 0.07 0.05
## [20,] -0.18 0.04
## [21,] -0.05 -0.07
## [22,] 0.16 -0.10
## [23,] -0.24 -0.20
## [24,] 0.78 0.19
## [25,] -0.27 0.01
## [26,] 0.19 0.05
## [27,] -0.08 0.07
## [28,] -0.17 0.07
## [29,] 0.07 -0.02
## [30,] -0.29 -0.06
## [31,] 0.07 -0.02
## [32,] -0.15 0.09
## [33,] -0.04 0.03
## [34,] 0.14 -0.06
## [35,] -0.24 -0.16
## [36,] 0.75 0.03
## [37,] -0.23 0.08
## [38,] 0.16 -0.10
## [39,] -0.08 -0.03
## [40,] -0.15 0.07
## [41,] 0.05 -0.04
## [42,] -0.25 0.06
## [43,] 0.06 0.04
## [44,] -0.18 -0.07
## [45,] -0.03 -0.06
## [46,] 0.15 0.02
## [47,] -0.22 -0.04
## [48,] 0.72 0.10
## [49,] -0.24 0.01
## [50,] 0.16 0.00
## [51,] -0.08 -0.03
## [52,] -0.13 0.04
## [53,] 0.05 0.03
## [54,] -0.26 0.00
## [55,] 0.05 -0.01
## [56,] -0.17 0.01
## [57,] -0.02 0.03
## [58,] 0.15 0.04
## [59,] -0.23 -0.09
## [60,] 0.70 0.04
# Plot P/ACF to lag 60 of seasonal differenced data.
# A second, seasonal difference (lag 12) removes the persistent seasonal
# spikes seen in the previous table; remaining structure is at lags 1 and 12.
dd_birth <- diff(d_birth, lag = 12)
astsa::acf2(dd_birth, max.lag=60)
## ACF PACF
## [1,] -0.30 -0.30
## [2,] -0.09 -0.20
## [3,] -0.09 -0.21
## [4,] 0.00 -0.14
## [5,] 0.07 -0.03
## [6,] 0.03 0.02
## [7,] -0.07 -0.06
## [8,] -0.04 -0.08
## [9,] 0.11 0.06
## [10,] 0.04 0.08
## [11,] 0.13 0.23
## [12,] -0.43 -0.32
## [13,] 0.14 -0.06
## [14,] -0.01 -0.13
## [15,] 0.03 -0.13
## [16,] 0.01 -0.11
## [17,] 0.02 0.02
## [18,] 0.00 0.06
## [19,] 0.03 0.04
## [20,] -0.07 -0.10
## [21,] -0.01 0.02
## [22,] 0.00 0.00
## [23,] 0.06 0.17
## [24,] -0.01 -0.13
## [25,] -0.12 -0.14
## [26,] 0.17 0.07
## [27,] -0.04 -0.04
## [28,] 0.03 -0.02
## [29,] -0.05 0.02
## [30,] -0.09 -0.06
## [31,] -0.01 -0.07
## [32,] 0.19 0.05
## [33,] -0.03 0.07
## [34,] -0.09 -0.06
## [35,] -0.02 0.05
## [36,] -0.04 -0.16
## [37,] 0.17 -0.01
## [38,] -0.14 -0.04
## [39,] 0.03 -0.01
## [40,] -0.05 -0.03
## [41,] 0.03 -0.01
## [42,] 0.10 0.01
## [43,] 0.00 0.00
## [44,] -0.10 0.03
## [45,] -0.03 -0.02
## [46,] 0.06 -0.07
## [47,] 0.02 0.05
## [48,] 0.01 -0.11
## [49,] -0.01 0.05
## [50,] 0.06 0.06
## [51,] -0.08 -0.03
## [52,] 0.03 -0.03
## [53,] 0.01 0.04
## [54,] -0.02 0.02
## [55,] -0.01 -0.04
## [56,] 0.00 -0.01
## [57,] -0.07 -0.13
## [58,] 0.17 0.07
## [59,] -0.04 0.07
## [60,] -0.01 -0.05
# Fit SARIMA(0,1,1)x(0,1,1)_12. What happens?
# Non-seasonal MA(1) plus seasonal MA(1), both significant in the t-table
# below; serves as the baseline before adding an AR term.
astsa::sarima(birth, p=0, d=1, q=1, P=0, D=1, Q=1, S=12)
## initial value 2.219164
## iter 2 value 2.013310
## iter 3 value 1.988107
## iter 4 value 1.980026
## iter 5 value 1.967594
## iter 6 value 1.965384
## iter 7 value 1.965049
## iter 8 value 1.964993
## iter 9 value 1.964992
## iter 9 value 1.964992
## iter 9 value 1.964992
## final value 1.964992
## converged
## initial value 1.951264
## iter 2 value 1.945867
## iter 3 value 1.945729
## iter 4 value 1.945723
## iter 5 value 1.945723
## iter 5 value 1.945723
## iter 5 value 1.945723
## final value 1.945723
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), include.mean = !no.constant, optim.control = list(trace = trc,
## REPORT = 1, reltol = tol))
##
## Coefficients:
## ma1 sma1
## -0.4734 -0.7861
## s.e. 0.0598 0.0451
##
## sigma^2 estimated as 47.4: log likelihood = -1211.28, aic = 2428.56
##
## $degrees_of_freedom
## [1] 371
##
## $ttable
## Estimate SE t.value p.value
## ma1 -0.4734 0.0598 -7.9097 0
## sma1 -0.7861 0.0451 -17.4227 0
##
## $AIC
## [1] 4.869388
##
## $AICc
## [1] 4.874924
##
## $BIC
## [1] 3.890415
# Add a nonseasonal AR(1) term and conclude: fit SARIMA(1,1,1)x(0,1,1)_12.
astsa::sarima(birth, p=1, d=1, q=1, P=0, D=1, Q=1, S=12)
## initial value 2.218186
## iter 2 value 2.032584
## iter 3 value 1.982464
## iter 4 value 1.975643
## iter 5 value 1.971721
## iter 6 value 1.967284
## iter 7 value 1.963840
## iter 8 value 1.961106
## iter 9 value 1.960849
## iter 10 value 1.960692
## iter 11 value 1.960683
## iter 12 value 1.960675
## iter 13 value 1.960672
## iter 13 value 1.960672
## iter 13 value 1.960672
## final value 1.960672
## converged
## initial value 1.940459
## iter 2 value 1.934425
## iter 3 value 1.932752
## iter 4 value 1.931750
## iter 5 value 1.931074
## iter 6 value 1.930882
## iter 7 value 1.930860
## iter 8 value 1.930859
## iter 8 value 1.930859
## final value 1.930859
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), include.mean = !no.constant, optim.control = list(trace = trc,
## REPORT = 1, reltol = tol))
##
## Coefficients:
## ar1 ma1 sma1
## 0.3038 -0.7006 -0.8000
## s.e. 0.0865 0.0604 0.0441
##
## sigma^2 estimated as 45.91: log likelihood = -1205.93, aic = 2419.85
##
## $degrees_of_freedom
## [1] 370
##
## $ttable
## Estimate SE t.value p.value
## ar1 0.3038 0.0865 3.5104 5e-04
## ma1 -0.7006 0.0604 -11.5984 0e+00
## sma1 -0.8000 0.0441 -18.1302 0e+00
##
## $AIC
## [1] 4.842869
##
## $AICc
## [1] 4.848523
##
## $BIC
## [1] 3.87441
# All three coefficients are significant and aic = 2419.85 improves on the
# MA-only fit above, so the added AR(1) term is retained.
# Monthly U.S. unemployment series from the astsa package:
# 372 observations, 1948 onward (see str() output below).
data(unemp, package="astsa")
str(unemp)
## Time-Series [1:372] from 1948 to 1979: 235 281 265 241 201 ...
# Fit your previous model to unemp and check the diagnostics
astsa::sarima(unemp, p=2, d=1, q=0, P=0, D=1, Q=1, S=12)
## initial value 3.340809
## iter 2 value 3.105512
## iter 3 value 3.086631
## iter 4 value 3.079778
## iter 5 value 3.069447
## iter 6 value 3.067659
## iter 7 value 3.067426
## iter 8 value 3.067418
## iter 8 value 3.067418
## final value 3.067418
## converged
## initial value 3.065481
## iter 2 value 3.065478
## iter 3 value 3.065477
## iter 3 value 3.065477
## iter 3 value 3.065477
## final value 3.065477
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), include.mean = !no.constant, optim.control = list(trace = trc,
## REPORT = 1, reltol = tol))
##
## Coefficients:
## ar1 ar2 sma1
## 0.1351 0.2464 -0.6953
## s.e. 0.0513 0.0515 0.0381
##
## sigma^2 estimated as 449.6: log likelihood = -1609.91, aic = 3227.81
##
## $degrees_of_freedom
## [1] 369
##
## $ttable
## Estimate SE t.value p.value
## ar1 0.1351 0.0513 2.6326 0.0088
## ar2 0.2464 0.0515 4.7795 0.0000
## sma1 -0.6953 0.0381 -18.2362 0.0000
##
## $AIC
## [1] 7.12457
##
## $AICc
## [1] 7.130239
##
## $BIC
## [1] 6.156174
# All three coefficients are significant at the 1% level ($ttable p-values).
# Forecast the data 3 years (n.ahead = 36 months) into the future.
# sarima.for() returns point forecasts ($pred) and standard errors ($se);
# note the $se values grow as the horizon lengthens.
astsa::sarima.for(unemp, p=2, d=1, q=0, P=0, D=1, Q=1, S=12, n.ahead=36)
## $pred
## Jan Feb Mar Apr May Jun Jul
## 1979 676.4664 685.1172 653.2388 585.6939 553.8813 664.4072 647.0657
## 1980 683.3045 687.7649 654.8658 586.1507 553.9285 664.1108 646.6220
## 1981 682.6406 687.0977 654.1968 585.4806 553.2579 663.4398 645.9508
## Aug Sep Oct Nov Dec
## 1979 611.0828 594.6414 569.3997 587.5801 581.1833
## 1980 610.5345 594.0427 568.7684 586.9320 580.5249
## 1981 609.8632 593.3713 568.0970 586.2606 579.8535
##
## $se
## Jan Feb Mar Apr May Jun Jul
## 1979 21.20465 32.07710 43.70167 53.66329 62.85364 71.12881 78.73590
## 1980 116.99599 124.17344 131.51281 138.60466 145.49706 152.12863 158.52302
## 1981 194.25167 201.10648 208.17066 215.11503 221.96039 228.64285 235.16874
## Aug Sep Oct Nov Dec
## 1979 85.75096 92.28663 98.41329 104.19488 109.67935
## 1980 164.68623 170.63839 176.39520 181.97333 187.38718
## 1981 241.53258 247.74268 253.80549 259.72970 265.52323
# Monthly chicken price series from astsa: 180 observations starting in 2002.
data(chicken, package="astsa")
str(chicken)
## Time-Series [1:180] from 2002 to 2016: 65.6 66.5 65.7 64.3 63.2 ...
# Fit the chicken model again and check diagnostics:
# SARIMA(2,1,0)x(1,0,0)_12 -- seasonal AR(1), no seasonal differencing.
astsa::sarima(chicken, p=2, d=1, q=0, P=1, D=0, Q=0, S=12)
## initial value 0.015039
## iter 2 value -0.226398
## iter 3 value -0.412955
## iter 4 value -0.460882
## iter 5 value -0.470787
## iter 6 value -0.471082
## iter 7 value -0.471088
## iter 8 value -0.471090
## iter 9 value -0.471092
## iter 10 value -0.471095
## iter 11 value -0.471095
## iter 12 value -0.471096
## iter 13 value -0.471096
## iter 14 value -0.471096
## iter 15 value -0.471097
## iter 16 value -0.471097
## iter 16 value -0.471097
## iter 16 value -0.471097
## final value -0.471097
## converged
## initial value -0.473585
## iter 2 value -0.473664
## iter 3 value -0.473721
## iter 4 value -0.473823
## iter 5 value -0.473871
## iter 6 value -0.473885
## iter 7 value -0.473886
## iter 8 value -0.473886
## iter 8 value -0.473886
## iter 8 value -0.473886
## final value -0.473886
## converged
## $fit
##
## Call:
## stats::arima(x = xdata, order = c(p, d, q), seasonal = list(order = c(P, D,
## Q), period = S), xreg = constant, optim.control = list(trace = trc, REPORT = 1,
## reltol = tol))
##
## Coefficients:
## ar1 ar2 sar1 constant
## 0.9154 -0.2494 0.3237 0.2353
## s.e. 0.0733 0.0739 0.0715 0.1973
##
## sigma^2 estimated as 0.3828: log likelihood = -169.16, aic = 348.33
##
## $degrees_of_freedom
## [1] 176
##
## $ttable
## Estimate SE t.value p.value
## ar1 0.9154 0.0733 12.4955 0.0000
## ar2 -0.2494 0.0739 -3.3728 0.0009
## sar1 0.3237 0.0715 4.5238 0.0000
## constant 0.2353 0.1973 1.1923 0.2347
##
## $AIC
## [1] 0.0842377
##
## $AICc
## [1] 0.09726452
##
## $BIC
## [1] -0.8448077
# The AR and seasonal AR terms are significant; the constant (drift) term is
# not (p = 0.2347 in $ttable).
# Forecast the chicken data 5 years (n.ahead = 60 months) into the future.
# The $pred/$se tables start mid-2016 because the observed series ends there,
# so the 2016 and 2021 rows are only partially filled.
astsa::sarima.for(chicken, p=2, d=1, q=0, P=1, D=0, Q=0, S=12, n.ahead=60)
## $pred
## Jan Feb Mar Apr May Jun Jul
## 2016
## 2017 110.5358 110.5612 110.5480 110.7055 111.0047 111.1189 111.1552
## 2018 111.8108 111.9782 112.1330 112.3431 112.5991 112.7952 112.9661
## 2019 114.1331 114.3464 114.5556 114.7827 115.0247 115.2473 115.4617
## 2020 116.7942 117.0224 117.2492 117.4819 117.7193 117.9505 118.1790
## 2021 119.5651 119.7980 120.0306 120.2650 120.5010 120.7350 120.9681
## Aug Sep Oct Nov Dec
## 2016 111.0907 110.8740 110.6853 110.5045 110.5527
## 2017 111.1948 111.2838 111.3819 111.4825 111.6572
## 2018 113.1380 113.3260 113.5168 113.7085 113.9242
## 2019 115.6765 115.8965 116.1174 116.3386 116.5675
## 2020 118.4077 118.6380 118.8686 119.0993 119.3326
## 2021
##
## $se
## Jan Feb Mar Apr May Jun
## 2016
## 2017 3.7414959 4.1793190 4.5747009 4.9373266 5.2742129 5.5903499
## 2018 8.2010253 8.5605811 8.9054714 9.2372195 9.5572539 9.8667955
## 2019 12.0038164 12.2921541 12.5738417 12.8492868 13.1188976 13.3830477
## 2020 15.1557253 15.3959082 15.6323906 15.8653300 16.0948844 16.3212022
## 2021 17.8397890 18.0473081 18.2524651 18.4553364 18.6559977 18.8545213
## Jul Aug Sep Oct Nov Dec
## 2016 0.6187194 1.3368594 2.0462419 2.6867986 3.2486625
## 2017 5.8893133 6.2367345 6.6253573 7.0309771 7.4344077 7.8255932
## 2018 10.1668604 10.4736807 10.7857727 11.0980056 11.4063211 11.7085266
## 2019 13.6420693 13.9002670 14.1573839 14.4122197 14.6638269 14.9117124
## 2020 16.5444204 16.7657634 16.9852163 17.2025022 17.4174076 17.6298379
## 2021 19.0509752
Chapter 1 - Introduction to Bayesian Thinking
Discrete probability distributions - two schools of thought, frequentist and Bayesian:
Bayes’ rule - Presbyterian minister Thomas Bayes was a mathematician in his spare time:
Sequential Bayes - the posterior after the first trial becomes the prior for sequential trials:
Example code includes:
# Define a spinner with five equal-area regions: regions
# (region areas are proportional to the vector entries, so equal entries
# give each region probability 1/5)
regions <- c(1, 1, 1, 1, 1)
# Plot the spinner
TeachBayes::spinner_plot(regions)
# Show the probability distribution -- each region gets probability 0.2
TeachBayes::spinner_probs(regions)
## Region Prob
## 1 1 0.2
## 2 2 0.2
## 3 3 0.2
## 4 4 0.2
## 5 5 0.2
# Define new spinner with unequal areas (probs 0.25, 0.25, 0.5): regions
regions <- c(2, 2, 4)
# Simulate 1000 spins: spins
spins <- TeachBayes::spinner_data(regions, nsim=1000)
# Graph the spin data using bar_plot()
TeachBayes::bar_plot(spins)
# Construct frequency table of spins
table(spins)
## spins
## 1 2 3
## 238 241 521
# Find fraction of spins equal to 2 (close to the true probability 0.25)
mean(spins == 2)
## [1] 0.241
# Find mean spin value
mean(spins)
## [1] 2.283
# Create the vector of models: Model
Model <- c("Spinner A", "Spinner B")
# Define the vector of prior probabilities (equal belief in each): Prior
Prior <- c(0.5, 0.5)
# Define the vector of likelihoods of the observed spin under each model:
# Likelihood
Likelihood <- c(1/2, 1/6)
# Make a data frame with variables Model, Prior, Likelihood: bayes_df
bayes_df <- data.frame(Model, Prior, Likelihood, stringsAsFactors=FALSE)
str(bayes_df)
## 'data.frame': 2 obs. of 3 variables:
## $ Model : chr "Spinner A" "Spinner B"
## $ Prior : num 0.5 0.5
## $ Likelihood: num 0.5 0.167
# Compute the posterior probabilities: bayesian_crank() adds
# Product = Prior * Likelihood and Posterior = Product normalized to sum 1
# (visible in the output columns below).
TeachBayes::bayesian_crank(bayes_df)
## Model Prior Likelihood Product Posterior
## 1 Spinner A 0.5 0.5000000 0.25000000 0.75
## 2 Spinner B 0.5 0.1666667 0.08333333 0.25
TeachBayes::prior_post_plot( TeachBayes::bayesian_crank(bayes_df) )
# Sequential Bayes: repeat the update with the prior set to (0.75, 0.25) --
# presumably the posterior carried forward from the previous trial.
# Define the vector of models: Model
Model <- c("Spinner A", "Spinner B")
# Define the vector of prior probabilities: Prior
Prior <- c(0.75, 0.25)
# Define the vector of likelihoods: Likelihood
Likelihood <- c(1/2, 1/6)
# Make a data frame with variables Model, Prior, Likelihood: bayes_df
bayes_df <- data.frame(Model, Prior, Likelihood, stringsAsFactors=FALSE)
str(bayes_df)
## 'data.frame': 2 obs. of 3 variables:
## $ Model : chr "Spinner A" "Spinner B"
## $ Prior : num 0.75 0.25
## $ Likelihood: num 0.5 0.167
# Compute the posterior probabilities
TeachBayes::bayesian_crank(bayes_df)
## Model Prior Likelihood Product Posterior
## 1 Spinner A 0.75 0.5000000 0.37500000 0.9
## 2 Spinner B 0.25 0.1666667 0.04166667 0.1
Chapter 2 - Binomial Probability
Bayes with discrete models - example of “percentage, p, of people who prefer discrete time period X for activity Y”:
Bayes with continuous priors - continuing with the example of “percentage, p, of people who prefer discrete time period X for activity Y”:
Updating the beta prior - the product of the beta-curve prior and the binomial likelihoods is again a beta-curve:
Bayesian inference - all inferences are based on various summarizations of the posterior beta-curve:
Posterior simulation - can simulate from the posterior probability using rbeta():
Example code includes:
# Discrete-prior inference for a binomial proportion.
# Define the values of the proportion: P
P <- c(0.5, 0.6, 0.7, 0.8, 0.9)
# Define Madison's prior: Prior
Prior <- c(0.3, 0.3, 0.2, 0.1, 0.1)
# Compute the likelihoods of observing 16 successes in 20 trials at each
# value of P: Likelihood
Likelihood <- dbinom(16, size=20, prob=P)
# Create Bayes data frame: bayes_df
bayes_df <- data.frame(P, Prior, Likelihood)
str(bayes_df)
## 'data.frame': 5 obs. of 3 variables:
## $ P : num 0.5 0.6 0.7 0.8 0.9
## $ Prior : num 0.3 0.3 0.2 0.1 0.1
## $ Likelihood: num 0.00462 0.03499 0.13042 0.2182 0.08978
# Compute the posterior probabilities: bayes_df
bayes_df <- TeachBayes::bayesian_crank(bayes_df)
str(bayes_df)
## 'data.frame': 5 obs. of 5 variables:
## $ P : num 0.5 0.6 0.7 0.8 0.9
## $ Prior : num 0.3 0.3 0.2 0.1 0.1
## $ Likelihood: num 0.00462 0.03499 0.13042 0.2182 0.08978
## $ Product : num 0.00139 0.0105 0.02608 0.02182 0.00898
## $ Posterior : num 0.0202 0.1527 0.3793 0.3173 0.1306
# Graphically compare the prior and posterior
TeachBayes::prior_post_plot(bayes_df)
# Summaries of a Beta(8.13, 3.67) curve for the proportion P.
# NOTE(review): the shape parameters 8.13/3.67 come from an earlier
# prior-matching step not shown here -- confirm against the course notes.
# Find the probability that P is smaller than 0.85
pbeta(0.85, 8.13, 3.67)
## [1] 0.9000721
# Find the probability that P is larger than 0.85
pbeta(0.85, 8.13, 3.67, lower.tail=FALSE)
## [1] 0.09992792
# Find the 0.75 quantile of P
qbeta(0.75, 8.13, 3.67)
## [1] 0.785503
# Choose a beta prior by matching two quantiles, then update it conjugately.
# Specify that the 0.25 quantile of P is equal to 0.7: quantile1
quantile1 <- list(p=0.25, x=0.7)
# Specify that the 0.75 quantile of P is equal to 0.85: quantile2
quantile2 <- list(p=0.75, x=0.85)
# Find the beta shape parameters matching the two quantiles: ab
ab <- LearnBayes::beta.select(quantile1, quantile2)
# Plot the beta curve using the beta_draw() function
TeachBayes::beta_draw(ab)
# Harry's shape parameters for his prior: ab
ab <- c(3, 3)
# Vector of successes and failures: sf
sf <- c(16, 4)
# Conjugate update: add successes/failures to the beta shape parameters,
# giving Beta(3+16, 3+4) = Beta(19, 7): ab_new
ab_new <- ab + sf
# Graph Harry's posterior
TeachBayes::beta_draw(ab_new)
# Posterior summaries for Harry's Beta(19, 7) posterior.
# Vector of beta parameters for Harry: ab
ab <- c(19, 7)
# Compute probability that P is smaller than 0.70
pbeta(0.7, ab[1], ab[2])
## [1] 0.3406549
# Show the area that is computed (shades the region from 0 to 0.7)
TeachBayes::beta_area(0, 0.7, ab)
# Vector of beta parameters for Harry: ab
ab <- c(19, 7)
# Compute 90 percent interval (equal-tail: 5th and 95th percentiles)
qbeta(c(0.05, 0.95), ab[1], ab[2])
## [1] 0.5804800 0.8605247
# Show the interval that is computed
TeachBayes::beta_interval(0.9, ab)
# "Classical" confidence interval for a binomial proportion, using the
# Agresti-Coull "plus four" adjustment: add 2 pseudo-successes and
# 2 pseudo-failures, then form the usual Wald interval around the
# adjusted estimate.
#
# Args:
#   y          : observed number of successes (0 <= y <= n)
#   n          : number of trials
#   conf.level : two-sided confidence level in (0, 1); default 0.95
#
# Returns: a length-2 numeric vector c(lower, upper).
classical_binom_ci <- function(y, n, conf.level = 0.95) {
  # Validate inputs early; the original silently produced NaN/NA on bad input.
  stopifnot(is.numeric(y), is.numeric(n),
            y >= 0, n >= y,
            conf.level > 0, conf.level < 1)
  s <- y + 2                             # adjusted successes
  f <- n - y + 2                         # adjusted failures
  n_new <- n + 4                         # adjusted sample size
  phat <- s / n_new                      # adjusted point estimate
  se <- sqrt(phat * (1 - phat) / n_new)  # standard error of phat
  z <- qnorm(1 - (1 - conf.level) / 2)   # two-sided normal critical value
  c(phat - z * se, phat + z * se)
}
# Compare the adjusted classical interval with a Bayesian probability
# interval under a uniform Beta(1, 1) prior -- the two come out similar.
# Define the number of successes and sample size: y, n
y <- 16
n <- 20
# Construct a 90 percent confidence interval
classical_binom_ci(y=y, n=n, conf.level=0.9)
## [1] 0.6046141 0.8953859
# Define the shape parameters for a uniform prior: ab
ab <- c(1, 1)
# Find the shape parameters of the posterior (conjugate update): ab_new
ab_new <- ab + c(y, n-y)
# Find a 90% Bayesian probability interval
TeachBayes::beta_interval(0.9, ab_new)
qbeta(c(0.05, 0.95), ab_new[1], ab_new[2])
## [1] 0.6155919 0.9011565
# Posterior simulation: approximate posterior summaries by Monte Carlo
# draws from the Beta(19, 7) posterior instead of exact pbeta/qbeta calls.
# Vector of beta parameters for Harry: ab
ab <- c(19, 7)
# Simulate 1000 draws from the beta posterior: p_sim
p_sim <- rbeta(1000, ab[1], ab[2])
# Construct a histogram of the simulated values
hist(p_sim, freq=FALSE)
# Compute the probability that P is larger than 0.7
mean(p_sim > 0.7)
## [1] 0.669
# Find a 90% probability interval
quantile(p_sim, c(0.05, 0.95))
## 5% 95%
## 0.5821422 0.8556054
# The same simulation approach extends to derived quantities: transform each
# draw into an odds ratio p / (1 - p) and summarize the transformed draws.
# Vector of beta parameters for Harry: ab
ab <- c(19, 7)
# Simulate 1000 draws from the beta posterior: p_sim
p_sim <- rbeta(1000, ab[1], ab[2])
# Compute the odds-ratio: or_sim
or_sim <- p_sim / (1 - p_sim)
# Construct a histogram of the simulated values of or_sim
hist(or_sim, freq=FALSE)
# Find the probability the odds ratio is greater than 2
mean(or_sim > 2)
## [1] 0.79
# Find a 90% probability interval for the odds ratio
quantile(or_sim, c(0.05, 0.95))
## 5% 95%
## 1.406177 6.168665
Chapter 3 - Normal mean
Normal sampling model - Roger Federer “serving efficiency” examples:
Bayes with a continuous prior - same example assuming normal distribution with mean M and sd s:
Updating the normal prior - suppose a starting prior for 18 +/- 1.56 (Mo +/- So):
Simulation - can take the Posterior M and S and run simulations using rnorm:
Example code includes:
# Discrete-prior inference for a normal mean M.
# Place possible values of M in a vector: Model
Model <- seq(250, 290, by = 10)
# Construct a uniform probability vector: Prior1
Prior1 <- rep(0.2, 5)
# Graph the prior using function prob_plot()
TeachBayes::prob_plot(data.frame(Model, Prior1))
# Construct a different probability distribution: Prior2
Prior2 <- c(0.3, 0.3, 0.2, 0.1, 0.1)
# Graph the prior using function prob_plot()
TeachBayes::prob_plot(data.frame(Model, Prior2))
# Define models and prior: M, Prior
M <- seq(250, 290, by = 10)
Prior <- rep(.2, 5)
# Collect observations
times <- c(240, 267, 308, 275, 271,
268, 258, 295, 315, 262)
# Compute ybar and standard error (sigma = 20 is treated as known)
ybar <- mean(times); n <- 10
sigma <- 20; se <- sigma / sqrt(n)
# Compute likelihoods of the sample mean under each M using dnorm():
# Likelihood
Likelihood <- dnorm(ybar, mean=M, sd=se)
# Collect the vectors M, Prior, Likelihood in a data frame: bayes_df
bayes_df <- data.frame(M, Prior, Likelihood)
# Use bayesian_crank to compute the posterior probabilities: bayes_df
bayes_df <- TeachBayes::bayesian_crank(bayes_df)
# Use prior_post_plot() to graph the prior and posterior probabilities
TeachBayes::prior_post_plot(bayes_df)
# Choose a normal prior by matching two quantiles, then update it with data.
# Specify the 0.02 quantile of M: quantile1
quantile1 <- list(p=0.02, x=240)
# Specify the 0.60 quantile of M: quantile2
quantile2 <- list(p=0.6, x=280)
# Find the normal parameters that match the two quantiles
normal_par <- LearnBayes::normal.select(quantile1, quantile2)
# Plot the normal curve using the normal_draw() function
TeachBayes::normal_draw(normal_par)
# Collect observations
times <- c(240, 267, 308, 275, 271,
268, 258, 295, 315, 262)
# Compute ybar and standard error (sigma = 20 treated as known)
ybar <- mean(times)
sigma <- 20; se <- sigma / sqrt(10)
# Define mean and standard error: Data
Data <- c(ybar, se)
# Define mean and standard deviation of prior: Prior
Prior <- c(260, 10)
# Use normal_update() to combine prior and data into the posterior
# (mean, sd): Posterior
Posterior <- TeachBayes::normal_update(Prior, Data)
# Construct plot of prior and posterior
TeachBayes::many_normal_plots(list(Prior, Posterior))
# Compare interval lengths: the frequentist 90% CI versus the Bayesian 90%
# probability interval. The Bayesian interval is shorter here (17.57 vs
# 20.79) because the prior adds information.
# Define mean and standard error: Data
Data <- c(275.9, 6.32)
# Compute 90% confidence interval (1.645 is the normal 95th percentile):
# C_Interval
C_Interval <- Data[1] + c(-1, 1) * 1.645 * Data[2]
# Find the length of the confidence interval
diff(C_Interval)
## [1] 20.7928
# Define mean and standard deviation of posterior: Posterior
Posterior <- c(271.35, 5.34)
# Display a 90% probability interval
TeachBayes::normal_interval(prob=0.90, Posterior)
# Compute the 90% probability interval: B_Interval
B_Interval <- qnorm(p=c(0.05, 0.95), mean=271.35, sd=5.34)
# Compute the length of the Bayesian interval
diff(B_Interval)
## [1] 17.56704
# Simulate 1000 values from the posterior curve of the mean M: M_sim
M_sim <- rnorm(1000, 270.5, 5.8)
# Compute the posterior standard deviation (close to the true sd of 5.8)
sd(M_sim)
## [1] 5.785342
# Compute the probability that M is smaller than 260
mean(M_sim < 260)
## [1] 0.022
# Find a 70 percent probability interval for M
quantile(M_sim, c(0.15, 0.85))
## 15% 85%
## 264.3010 276.6793
# Posterior predictive simulation: first draw the mean, then draw a new
# observation around that mean (sd = 20), capturing both sources of
# uncertainty.
# Simulate 1000 draws from John's posterior density: M_sim
M_sim <- rnorm(1000, 270.5, 5.8)
# Simulate 1000 draws from the predictive density: y_sim
y_sim <- rnorm(1000, M_sim, 20)
# Compute the probability I score less than 250
mean(y_sim < 250)
## [1] 0.183
# Find a 90 percent prediction interval for my score
quantile(y_sim, c(0.05, 0.95))
## 5% 95%
## 235.2359 302.0141
Chapter 4 - Bayesian Comparisons
Comparing two proportions - multiple parameters rather than just a single proportion or a single mean:
Proportions with continuous priors - continuing with the exercise examples with pW and pM:
Normal model inference - modeling when both the mean M and the standard deviation S are unknown:
Bayesian regression - example of looking at “how much slower does Rafa serve than Roger?”:
Example code includes:
# Two-proportion inference on a discrete grid: each proportion takes values
# 0.1, 0.3, 0.5, 0.7, 0.9, giving 25 (pN, pS) pairs.
# Define a uniform prior on all 25 pairs (each gets 1/25 = 0.04): prior
prior <- TeachBayes::testing_prior(0.1, 0.9, 5, uniform=TRUE)
# Display the prior matrix
prior
## 0.1 0.3 0.5 0.7 0.9
## 0.1 0.04 0.04 0.04 0.04 0.04
## 0.3 0.04 0.04 0.04 0.04 0.04
## 0.5 0.04 0.04 0.04 0.04 0.04
## 0.7 0.04 0.04 0.04 0.04 0.04
## 0.9 0.04 0.04 0.04 0.04 0.04
# Graph the prior
TeachBayes::draw_two_p(prior)
# Find the probability distribution of pN - pS: d_NS
d_NS <- TeachBayes::two_p_summarize(prior)
# Graph this distribution
TeachBayes::prob_plot(d_NS)
# Define a uniform prior on all 25 pairs: prior
prior <- TeachBayes::testing_prior(0.1, 0.9, 5, uniform = TRUE)
# Define the data as (successes, failures) for each group: s1f1, s2f2
s1f1 <- c(12, 8)
s2f2 <- c(17, 3)
# Compute the posterior over the 25 pairs: post
post <- TeachBayes::two_p_update(prior, s1f1, s2f2)
# Graph the posterior
TeachBayes::draw_two_p(post)
# Find the probability distribution of pN - pS: d_NS
d_NS <- TeachBayes::two_p_summarize(post)
# Graph this distribution
TeachBayes::prob_plot(d_NS)
# Simulate the PRIOR distribution of the difference pN - pS: both
# proportions get independent, identical Beta(4.91, 3.38) priors, so the
# difference is centered near 0 (P(d_NS > 0) ~ 0.5 below).
# Simulate 1000 values from the prior on pS: sim_pS
sim_pS <- rbeta(1000, 4.91, 3.38)
# Simulate 1000 values from the prior on pN: sim_pN
sim_pN <- rbeta(1000, 4.91, 3.38)
# For each pair of proportions, compute the difference: d_NS
d_NS <- sim_pN - sim_pS
# Plot a histogram of the values in d_NS
hist(d_NS)
# Find the probability d_NS is positive
mean(d_NS > 0)
## [1] 0.491
# Find a 90% probability interval for d_NS
quantile(d_NS, c(0.05, 0.95))
## 5% 95%
## -0.3784893 0.3888031
# Posterior simulation for two proportions with independent uniform priors.
# Define the number of successes and number of failures: s1f1, s2f2
s1f1 <- c(12, 8)
s2f2 <- c(17, 3)
# Find the prior beta shape parameters for pS and pN (uniform = Beta(1,1)):
pS_prior <- c(1, 1)
pN_prior <- c(1, 1)
# Find the posterior beta shape parameters for pS: pS_shape
pS_shape <- pS_prior + s1f1
# Find the posterior beta shape parameters for pN: pN_shape
pN_shape <- pN_prior + s2f2
# Simulate 1000 draws from the posterior: sim_pS, sim_pN
sim_pS <- rbeta(1000, pS_shape[1], pS_shape[2])
sim_pN <- rbeta(1000, pN_shape[1], pN_shape[2])
# Construct a scatterplot of the posterior
plot(sim_pS, sim_pN)
# Repeat with the posterior shapes written out explicitly
# (Beta(13, 9) and Beta(18, 4)), and summarize the RATIO pN / pS.
# Simulate 1000 draws from the posterior: sim_pS, sim_pN
sim_pS <- rbeta(1000, 13, 9)
sim_pN <- rbeta(1000, 18, 4)
# For each pair of proportions, compute the ratio: r_NS
r_NS <- sim_pN / sim_pS
# Plot a histogram of the values in r_NS
hist(r_NS)
# Find the probability r_NS is larger than 1
mean(r_NS > 1)
## [1] 0.959
# Find a 80% probability interval for r_NS
quantile(r_NS, c(0.1, 0.9))
## 10% 90%
## 1.096503 1.866527
# Normal model with BOTH mean M and sd S unknown, approximated by fitting an
# intercept-only regression and simulating from its posterior with arm::sim().
# Collect reaction times: times
times <- c(240, 267, 308, 275, 271,
268, 258, 295, 315, 262)
# Fit a normal model (intercept-only lm; the intercept estimates M): fit
fit <- lm(times ~ 1)
# Simulate 1000 from posterior: sim_fit
sim_fit <- arm::sim(fit, n.sims=1000)
# Extract the simulated values of M and S: M_sim, s_sim
# NOTE(review): this line assigns S_sim while the repeat below uses s_sim --
# inconsistent capitalization between the two otherwise-identical passages.
M_sim <- coef(sim_fit)
S_sim <- arm::sigma.hat(sim_fit)
# Construct a scatterplot of simulated values
plot(M_sim, S_sim)
# Collect reaction times: times
times <- c(240, 267, 308, 275, 271,
268, 258, 295, 315, 262)
# Fit a normal model: fit
fit <- lm(times ~ 1)
# Simulate 1000 from posterior: sim_fit
sim_fit <- arm::sim(fit, n.sims = 1000)
# Extract the simulated values of M and S: M_sim, s_sim
M_sim <- coef(sim_fit)
s_sim <- arm::sigma.hat(sim_fit)
# Compute values of the 75th percentile (0.674 is the standard normal
# 75th percentile): Q75
Q75 <- M_sim + 0.674 * s_sim
# Construct histogram of the posterior of Q75
hist(Q75)
# Find a 70% probability interval for Q75
quantile(Q75, c(0.15, 0.85))
## 15% 85%
## 283.9219 302.2872
# Bayesian regression: compare two people's times via a Person indicator.
# First 10 times are "Jim", last 10 are "Steven".
ddTime <- c( 240, 267, 308, 275, 271, 268, 258, 295, 315, 262, 279, 241, 225, 252, 288, 242, 281, 254, 263, 276 )
ddPerson <- rep(c("Jim", "Steven"), each=10)
dd <- data.frame(Person=factor(ddPerson), Time=ddTime)
# Perform a regression fit of Time with Person as a covariate: fit
fit <- lm(Time ~ Person, data = dd)
# Simulate 1000 values from the posterior distribution: sim_fit
sim_fit <- arm::sim(fit, n.sims=1000)
# Extract simulated draws of beta and S: beta_sim, s_sim
# (beta_sim has a column per coefficient: intercept and the Person slope)
beta_sim <- coef(sim_fit)
s_sim <- arm::sigma.hat(sim_fit)
# Construct a scatterplot of the posterior distribution of (beta0, beta1)
plot(beta_sim[, 1], beta_sim[, 2])
# Perform a regression fit of Time with Person as a covariate: fit
fit <- lm(Time ~ Person, data = dd)
# Simulate 1000 values from the posterior distribution: sim_fit
sim_fit <- arm::sim(fit, n.sims = 1000)
# Extract simulated draws of beta and S: beta_sim, s_sim
beta_sim <- coef(sim_fit)
s_sim <- arm::sigma.hat(sim_fit)
# Compute simulated values of the standardized change (slope in units of
# the residual sd): s_delta
s_delta <- beta_sim[,2] / s_sim
# Find 90% interval estimate for s_delta; the interval below covers 0, so
# the simulated difference between the two people is not conclusive.
quantile(s_delta, c(0.05, 0.95))
## 5% 95%
## -1.42941624 0.06782187